[llvm-branch-commits] [llvm] [AMDGPU] Enable atomic optimizer for divergent i64 and double values (PR #96934)
Vikram Hegde via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Jul 2 06:06:02 PDT 2024
https://github.com/vikramRH updated https://github.com/llvm/llvm-project/pull/96934
>From fb7af14ba37f88cdf785bcc5d1c7ac308449c214 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Thu, 27 Jun 2024 07:31:01 -0400
Subject: [PATCH 1/3] [AMDGPU] Enable atomic optimizer for 64 bit divergent
values
---
.../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 22 +-
.../GlobalISel/global-atomic-fadd.f64.ll | 1346 +++++-
.../atomic_optimizations_global_pointer.ll | 1038 ++++-
.../atomic_optimizations_local_pointer.ll | 638 ++-
.../CodeGen/AMDGPU/global-atomic-fadd.f64.ll | 1332 +++++-
.../AMDGPU/global_atomic_optimizer_fp_rtn.ll | 504 +-
.../global_atomics_optimizer_fp_no_rtn.ll | 432 +-
.../AMDGPU/global_atomics_scan_fadd.ll | 4054 +++++++++++-----
.../AMDGPU/global_atomics_scan_fmax.ll | 2473 +++++++---
.../AMDGPU/global_atomics_scan_fmin.ll | 2473 +++++++---
.../AMDGPU/global_atomics_scan_fsub.ll | 4056 ++++++++++++-----
11 files changed, 14422 insertions(+), 3946 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index f9c978c5a0309..acddee0ba64e3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -178,6 +178,20 @@ bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
return Changed;
}
+static bool shouldOptimizeForType(Type *Ty) { // Divergent-value type filter.
+  switch (Ty->getTypeID()) {
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+    return true; // float and double are accepted unconditionally.
+  case Type::IntegerTyID: {
+    const unsigned BitWidth = Ty->getIntegerBitWidth(); // Query once.
+    return BitWidth == 32 || BitWidth == 64; // Only i32 and i64 qualify.
+  }
+  default: // Every other type ID is rejected.
+    return false;
+  }
+}
+
void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
// Early exit for unhandled address space atomic instructions.
switch (I.getPointerAddressSpace()) {
@@ -230,8 +244,7 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
// value to the atomic calculation. We can only optimize divergent values if
// we have DPP available on our subtarget, and the atomic operation is 32
// bits.
- if (ValDivergent &&
- (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
+ if (ValDivergent && (!ST->hasDPP() || !shouldOptimizeForType(I.getType()))) {
return;
}
@@ -313,8 +326,7 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
// value to the atomic calculation. We can only optimize divergent values if
// we have DPP available on our subtarget, and the atomic operation is 32
// bits.
- if (ValDivergent &&
- (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
+ if (ValDivergent && (!ST->hasDPP() || !shouldOptimizeForType(I.getType()))) {
return;
}
@@ -745,7 +757,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// of each active lane in the wavefront. This will be our new value
// which we will provide to the atomic operation.
Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
- assert(TyBitWidth == 32);
+ assert(TyBitWidth == 32 || TyBitWidth == 64);
NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
{NewV, LastLaneIdx});
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll
index b058ad1023e13..8ad91f001bd72 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll
@@ -1,249 +1,1219 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefixes=GFX90A,GFX90A_ITERATIVE %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefixes=GFX90A,GFX90A_DPP %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefixes=GFX940,GFX940_ITERATIVE %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefixes=GFX940,GFX940_DPP %s
define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic
+ ; GFX90A: bb.1 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic
+ ; GFX940: bb.1 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret void
}
define amdgpu_ps double @global_atomic_fadd_f64_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic
+ ; GFX90A: bb.1 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic
+ ; GFX940: bb.1 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret double %ret
}
define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic
+ ; GFX90A: bb.1 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic
+ ; GFX940: bb.1 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret void
}
define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic
+ ; GFX90A: bb.1 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic
+ ; GFX940: bb.1 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
+ ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret double %ret
}
define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic
+ ; GFX90A: bb.1 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic
+ ; GFX940: bb.1 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret void
}
define amdgpu_ps double @global_atomic_fadd_f64_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic
+ ; GFX90A: bb.1 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic
+ ; GFX940: bb.1 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret double %ret
}
define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic
+ ; GFX90A: bb.1 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic
+ ; GFX940: bb.1 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret void
}
define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic
+ ; GFX90A: bb.1 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic
+ ; GFX940: bb.1 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
+ ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret double %ret
}
define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) #0 {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw
+ ; GFX90A: bb.1 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw
+ ; GFX940: bb.1 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: S_ENDPGM 0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic
ret void
}
define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) #0 {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw
+ ; GFX90A: bb.1 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw
+ ; GFX940: bb.1 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic
ret double %ret
}
define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) #0 {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw
+ ; GFX90A_ITERATIVE: bb.1 (%ir-block.0):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000)
+ ; GFX90A_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
+ ; GFX90A_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.2
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.2 (%ir-block.5):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.7(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.7
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.3 (%ir-block.7):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.4(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], %25, [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.4.Flow:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.6(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: SI_END_CF %35, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.6
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.5 (%ir-block.9):
+ ; GFX90A_ITERATIVE-NEXT: S_ENDPGM 0
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.6.Flow1:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.5(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.5
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.7.ComputeLoop:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %17, %bb.7, [[S_MOV_B]], %bb.2
+ ; GFX90A_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI %22, %bb.7, [[COPY4]], %bb.2
+ ; GFX90A_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_FFBL_B32_e64_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY5]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_FFBL_B32_e64_1:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY6]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FFBL_B32_e64_1]], [[V_MOV_B32_e32_1]], 0, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_FFBL_B32_e64_]], [[V_ADD_U32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY2]], [[V_READFIRSTLANE_B32_]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY3]], [[V_READFIRSTLANE_B32_1]]
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY7]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY8]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY9]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_NOT_B32_e32_1:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY10]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY11]], [[V_NOT_B32_e32_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY12]], [[V_NOT_B32_e32_1]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE2]], [[COPY13]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]]
+ ; GFX90A_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.8
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.8.ComputeEnd:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.7
+ ; GFX90A_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY16]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE3]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY14]]
+ ; GFX90A_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY18]], [[COPY19]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY17]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY20]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY21]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.3
+ ;
+ ; GFX90A_DPP-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw
+ ; GFX90A_DPP: bb.1 (%ir-block.0):
+ ; GFX90A_DPP-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000)
+ ; GFX90A_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX90A_DPP-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A_DPP-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
+ ; GFX90A_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.2
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.2 (%ir-block.5):
+ ; GFX90A_DPP-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX90A_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX90A_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX90A_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
+ ; GFX90A_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
+ ; GFX90A_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX90A_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY9]], [[COPY10]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX90A_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY11]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX90A_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[REG_SEQUENCE1]], [[COPY12]], implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY13]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY14]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY15]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY16]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY17]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY18:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY18]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX90A_DPP-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0
+ ; GFX90A_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1
+ ; GFX90A_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY19]], [[S_MOV_B32_2]]
+ ; GFX90A_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY20]], [[S_MOV_B32_2]]
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+ ; GFX90A_DPP-NEXT: [[STRICT_WWM:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[COPY21]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY22]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.3
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.3 (%ir-block.31):
+ ; GFX90A_DPP-NEXT: successors: %bb.4(0x80000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A_DPP-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.4.Flow:
+ ; GFX90A_DPP-NEXT: successors: %bb.5(0x80000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.5 (%ir-block.33):
+ ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw
+ ; GFX940_ITERATIVE: bb.1 (%ir-block.0):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000)
+ ; GFX940_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX940_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
+ ; GFX940_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.2
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.2 (%ir-block.5):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.7(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.7
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.3 (%ir-block.7):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.4(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], %24, [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.4.Flow:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.6(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: SI_END_CF %34, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.6
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.5 (%ir-block.9):
+ ; GFX940_ITERATIVE-NEXT: S_ENDPGM 0
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.6.Flow1:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.5(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.5
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.7.ComputeLoop:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %16, %bb.7, [[S_MOV_B]], %bb.2
+ ; GFX940_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI %21, %bb.7, [[COPY4]], %bb.2
+ ; GFX940_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_FFBL_B32_e64_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY5]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_FFBL_B32_e64_1:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY6]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FFBL_B32_e64_1]], [[V_MOV_B32_e32_1]], 0, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_FFBL_B32_e64_]], [[V_ADD_U32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY2]], [[V_READFIRSTLANE_B32_]]
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY3]], [[V_READFIRSTLANE_B32_1]]
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY7]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
+ ; GFX940_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]]
+ ; GFX940_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY8]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY9]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_NOT_B32_e32_1:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY10]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY11]], [[V_NOT_B32_e32_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY12]], [[V_NOT_B32_e32_1]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+ ; GFX940_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]]
+ ; GFX940_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE2]], [[COPY13]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]]
+ ; GFX940_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.8
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.8.ComputeEnd:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.7
+ ; GFX940_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY16]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE3]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY14]]
+ ; GFX940_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY18]], [[COPY19]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY17]]
+ ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY20]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY21]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.3
+ ;
+ ; GFX940_DPP-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw
+ ; GFX940_DPP: bb.1 (%ir-block.0):
+ ; GFX940_DPP-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000)
+ ; GFX940_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX940_DPP-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940_DPP-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
+ ; GFX940_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.2
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.2 (%ir-block.5):
+ ; GFX940_DPP-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX940_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX940_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX940_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
+ ; GFX940_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
+ ; GFX940_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX940_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY9]], [[COPY10]], implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX940_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY11]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX940_DPP-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX940_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[REG_SEQUENCE1]], [[COPY12]], implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY13]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY14]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY15]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY16]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY17]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY18:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY18]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX940_DPP-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0
+ ; GFX940_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1
+ ; GFX940_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY19]], [[S_MOV_B32_2]]
+ ; GFX940_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY20]], [[S_MOV_B32_2]]
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+ ; GFX940_DPP-NEXT: [[STRICT_WWM:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[COPY21]], implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY22]], implicit $exec
+ ; GFX940_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.3
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.3 (%ir-block.31):
+ ; GFX940_DPP-NEXT: successors: %bb.4(0x80000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940_DPP-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.4.Flow:
+ ; GFX940_DPP-NEXT: successors: %bb.5(0x80000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.5 (%ir-block.33):
+ ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_ENDPGM 0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic
ret void
}
define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) #0 {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX90A_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw
+ ; GFX90A_ITERATIVE: bb.1 (%ir-block.0):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000)
+ ; GFX90A_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
+ ; GFX90A_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.2
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.2 (%ir-block.5):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.7(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.7
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.3 (%ir-block.7):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.4(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], %28, [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.4 (%ir-block.9):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.6(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.3, [[DEF]], %bb.8
+ ; GFX90A_ITERATIVE-NEXT: SI_END_CF %38, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY7]], 0, %27, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.6
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.5 (%ir-block.13):
+ ; GFX90A_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY %43.sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY %43.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; GFX90A_ITERATIVE-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.6.Flow:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.5(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.4, [[DEF]], %bb.1
+ ; GFX90A_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.5
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.7.ComputeLoop:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI %19, %bb.7, [[S_MOV_B]], %bb.2
+ ; GFX90A_ITERATIVE-NEXT: [[PHI3:%[0-9]+]]:vreg_64_align2 = PHI %18, %bb.7, [[DEF]], %bb.2
+ ; GFX90A_ITERATIVE-NEXT: [[PHI4:%[0-9]+]]:vreg_64_align2 = PHI %24, %bb.7, [[COPY4]], %bb.2
+ ; GFX90A_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_FFBL_B32_e64_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY10]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_FFBL_B32_e64_1:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY11]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FFBL_B32_e64_1]], [[V_MOV_B32_e32_1]], 0, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_FFBL_B32_e64_]], [[V_ADD_U32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY2]], [[V_READFIRSTLANE_B32_4]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY3]], [[V_READFIRSTLANE_B32_5]]
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: $m0 = COPY [[V_READFIRSTLANE_B32_7]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_6]], $m0, [[COPY14]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: $m0 = COPY [[V_READFIRSTLANE_B32_9]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_8]], $m0, [[COPY15]]
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_]], %subreg.sub0, [[V_WRITELANE_B32_1]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY16]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY17]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY18]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_NOT_B32_e32_1:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY19]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY20]], [[V_NOT_B32_e32_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY21]], [[V_NOT_B32_e32_1]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY22:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE4]], [[COPY22]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]]
+ ; GFX90A_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.8
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.8.ComputeEnd:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI5:%[0-9]+]]:vreg_64_align2 = PHI [[REG_SEQUENCE3]], %bb.7
+ ; GFX90A_ITERATIVE-NEXT: [[PHI6:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_1]], %bb.7
+ ; GFX90A_ITERATIVE-NEXT: [[COPY23:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY24:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY25:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY25]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE5]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[COPY23]]
+ ; GFX90A_ITERATIVE-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY27]], [[COPY28]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[COPY26]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY29]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY30]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.3
+ ;
+ ; GFX90A_DPP-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw
+ ; GFX90A_DPP: bb.1 (%ir-block.0):
+ ; GFX90A_DPP-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+ ; GFX90A_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX90A_DPP-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A_DPP-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX90A_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
+ ; GFX90A_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.2
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.2 (%ir-block.5):
+ ; GFX90A_DPP-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX90A_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX90A_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX90A_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
+ ; GFX90A_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
+ ; GFX90A_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX90A_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY9]], [[COPY10]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX90A_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY11]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX90A_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[REG_SEQUENCE1]], [[COPY12]], implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY13]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY14]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY15]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY16]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY17]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY18:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY18]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY19:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY19]], [[V_ADD_F64_e64_5]], 312, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX90A_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0
+ ; GFX90A_DPP-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1
+ ; GFX90A_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY20]], [[S_MOV_B32_2]]
+ ; GFX90A_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY21]], [[S_MOV_B32_2]]
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[COPY22:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+ ; GFX90A_DPP-NEXT: [[STRICT_WWM:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[COPY22]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY23]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.3
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.3 (%ir-block.32):
+ ; GFX90A_DPP-NEXT: successors: %bb.5(0x80000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.5
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.4.Flow:
+ ; GFX90A_DPP-NEXT: successors: %bb.6(0x80000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %44, %bb.5, [[DEF]], %bb.1
+ ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.6
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.5 (%ir-block.35):
+ ; GFX90A_DPP-NEXT: successors: %bb.4(0x80000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.3, [[DEF]], %bb.2
+ ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0
+ ; GFX90A_DPP-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1
+ ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY25]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[STRICT_WWM1:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[V_MOV_B6]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE4]]
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_6:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY26]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.4
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.6 (%ir-block.40):
+ ; GFX90A_DPP-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0
+ ; GFX90A_DPP-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1
+ ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY27]], implicit $exec
+ ; GFX90A_DPP-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY28]], implicit $exec
+ ; GFX90A_DPP-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; GFX90A_DPP-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX940_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw
+ ; GFX940_ITERATIVE: bb.1 (%ir-block.0):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000)
+ ; GFX940_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX940_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
+ ; GFX940_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.2
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.2 (%ir-block.5):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.7(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.7
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.3 (%ir-block.7):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.4(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], %27, [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.4 (%ir-block.9):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.6(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.3, [[DEF]], %bb.8
+ ; GFX940_ITERATIVE-NEXT: SI_END_CF %37, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY7]], 0, %26, 0, 0, implicit $mode, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.6
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.5 (%ir-block.13):
+ ; GFX940_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY %42.sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY %42.sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; GFX940_ITERATIVE-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.6.Flow:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.5(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.4, [[DEF]], %bb.1
+ ; GFX940_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.5
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.7.ComputeLoop:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI %18, %bb.7, [[S_MOV_B]], %bb.2
+ ; GFX940_ITERATIVE-NEXT: [[PHI3:%[0-9]+]]:vreg_64_align2 = PHI %17, %bb.7, [[DEF]], %bb.2
+ ; GFX940_ITERATIVE-NEXT: [[PHI4:%[0-9]+]]:vreg_64_align2 = PHI %23, %bb.7, [[COPY4]], %bb.2
+ ; GFX940_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_FFBL_B32_e64_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY10]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_FFBL_B32_e64_1:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY11]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FFBL_B32_e64_1]], [[V_MOV_B32_e32_1]], 0, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_FFBL_B32_e64_]], [[V_ADD_U32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY2]], [[V_READFIRSTLANE_B32_4]]
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY3]], [[V_READFIRSTLANE_B32_5]]
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: $m0 = COPY [[V_READFIRSTLANE_B32_7]]
+ ; GFX940_ITERATIVE-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_6]], $m0, [[COPY14]]
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: $m0 = COPY [[V_READFIRSTLANE_B32_9]]
+ ; GFX940_ITERATIVE-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_8]], $m0, [[COPY15]]
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_]], %subreg.sub0, [[V_WRITELANE_B32_1]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]]
+ ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY16]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
+ ; GFX940_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]]
+ ; GFX940_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY17]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY18]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_NOT_B32_e32_1:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY19]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY20]], [[V_NOT_B32_e32_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY21]], [[V_NOT_B32_e32_1]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+ ; GFX940_ITERATIVE-NEXT: [[COPY22:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]]
+ ; GFX940_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE4]], [[COPY22]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]]
+ ; GFX940_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.8
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.8.ComputeEnd:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI5:%[0-9]+]]:vreg_64_align2 = PHI [[REG_SEQUENCE3]], %bb.7
+ ; GFX940_ITERATIVE-NEXT: [[PHI6:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_1]], %bb.7
+ ; GFX940_ITERATIVE-NEXT: [[COPY23:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY24:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY25:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY25]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE5]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_ITERATIVE-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[COPY23]]
+ ; GFX940_ITERATIVE-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY27]], [[COPY28]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[COPY26]]
+ ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY29]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY30]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.3
+ ;
+ ; GFX940_DPP-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw
+ ; GFX940_DPP: bb.1 (%ir-block.0):
+ ; GFX940_DPP-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+ ; GFX940_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX940_DPP-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940_DPP-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX940_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
+ ; GFX940_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.2
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.2 (%ir-block.5):
+ ; GFX940_DPP-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX940_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX940_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX940_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
+ ; GFX940_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
+ ; GFX940_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX940_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY9]], [[COPY10]], implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX940_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY11]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX940_DPP-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX940_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[REG_SEQUENCE1]], [[COPY12]], implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY13]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY14]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY15]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY16]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY17]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY18:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY18]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY19:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY19]], [[V_ADD_F64_e64_5]], 312, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX940_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0
+ ; GFX940_DPP-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1
+ ; GFX940_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY20]], [[S_MOV_B32_2]]
+ ; GFX940_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY21]], [[S_MOV_B32_2]]
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[COPY22:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+ ; GFX940_DPP-NEXT: [[STRICT_WWM:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[COPY22]], implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY23]], implicit $exec
+ ; GFX940_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.3
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.3 (%ir-block.32):
+ ; GFX940_DPP-NEXT: successors: %bb.5(0x80000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.5
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.4.Flow:
+ ; GFX940_DPP-NEXT: successors: %bb.6(0x80000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %43, %bb.5, [[DEF]], %bb.1
+ ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.6
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.5 (%ir-block.35):
+ ; GFX940_DPP-NEXT: successors: %bb.4(0x80000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.3, [[DEF]], %bb.2
+ ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0
+ ; GFX940_DPP-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1
+ ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec
+ ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY25]], implicit $exec
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[STRICT_WWM1:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[V_MOV_B6]], implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE4]]
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_6:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY26]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.4
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.6 (%ir-block.40):
+ ; GFX940_DPP-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0
+ ; GFX940_DPP-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1
+ ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY27]], implicit $exec
+ ; GFX940_DPP-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY28]], implicit $exec
+ ; GFX940_DPP-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; GFX940_DPP-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic
ret double %ret
}
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index d3944d3d52d77..f0cec54691d5d 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2,12 +2,12 @@
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX1264 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX1232 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232 %s
declare i32 @llvm.amdgcn.workitem.id.x()
@@ -1744,87 +1744,440 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX7LESS-NEXT: s_endpgm
;
-; GFX89-LABEL: add_i64_varying:
-; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
-; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
-; GFX89-NEXT: v_mov_b32_e32 v1, 0
-; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
-; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_wbinvl1_vol
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
-; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX89-NEXT: s_endpgm
+; GFX8-LABEL: add_i64_varying:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: v_mov_b32_e32 v3, 0
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX8-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s6
+; GFX8-NEXT: v_readlane_b32 s8, v0, s6
+; GFX8-NEXT: v_readlane_b32 s7, v3, s6
+; GFX8-NEXT: v_writelane_b32 v1, s4, m0
+; GFX8-NEXT: s_add_u32 s4, s4, s8
+; GFX8-NEXT: v_writelane_b32 v2, s5, m0
+; GFX8-NEXT: s_addc_u32 s5, s5, s7
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX8-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execz .LBB5_4
+; GFX8-NEXT: ; %bb.3:
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: s_mov_b32 s11, 0xf000
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s8, s2
+; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: .LBB5_4:
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_readfirstlane_b32 s4, v4
+; GFX8-NEXT: v_readfirstlane_b32 s5, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s5, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: add_i64_varying:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s6
+; GFX9-NEXT: v_readlane_b32 s8, v0, s6
+; GFX9-NEXT: v_readlane_b32 s7, v3, s6
+; GFX9-NEXT: v_writelane_b32 v1, s4, m0
+; GFX9-NEXT: s_add_u32 s4, s4, s8
+; GFX9-NEXT: v_writelane_b32 v2, s5, m0
+; GFX9-NEXT: s_addc_u32 s5, s5, s7
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX9-NEXT: s_cbranch_execz .LBB5_4
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s2
+; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: v_mov_b32_e32 v4, s5
+; GFX9-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: .LBB5_4:
+; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: v_readfirstlane_b32 s4, v4
+; GFX9-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v2, vcc
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
;
-; GFX10-LABEL: add_i64_varying:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s11, s7
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
-; GFX10-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_mov_b32 s5, s1
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: add_i64_varying:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-LABEL: add_i64_varying:
-; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s11, s7
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_mov_b32 s5, s1
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-NEXT: s_endpgm
+; GFX1064-LABEL: add_i64_varying:
+; GFX1064: ; %bb.0: ; %entry
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_mov_b64 s[4:5], 0
+; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX1064-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1064-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1064-NEXT: v_writelane_b32 v1, s4, s6
+; GFX1064-NEXT: v_writelane_b32 v2, s5, s6
+; GFX1064-NEXT: s_add_u32 s4, s4, s7
+; GFX1064-NEXT: s_addc_u32 s5, s5, s8
+; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1064-NEXT: s_cbranch_execz .LBB5_4
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: v_mov_b32_e32 v3, s4
+; GFX1064-NEXT: v_mov_b32_e32 v4, s5
+; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s10, -1
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_mov_b32 s8, s2
+; GFX1064-NEXT: s_mov_b32 s9, s3
+; GFX1064-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: buffer_gl1_inv
+; GFX1064-NEXT: buffer_gl0_inv
+; GFX1064-NEXT: .LBB5_4:
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1064-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1064-NEXT: v_add_co_u32 v0, vcc, s2, v1
+; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: add_i64_varying:
+; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032-NEXT: s_mov_b64 s[4:5], 0
+; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1032-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1032-NEXT: v_writelane_b32 v2, s5, s3
+; GFX1032-NEXT: s_add_u32 s4, s4, s6
+; GFX1032-NEXT: s_addc_u32 s5, s5, s7
+; GFX1032-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1032-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s6, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s6, exec_lo, s6
+; GFX1032-NEXT: s_cbranch_execz .LBB5_4
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: v_mov_b32_e32 v3, s4
+; GFX1032-NEXT: v_mov_b32_e32 v4, s5
+; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s10, -1
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_mov_b32 s8, s2
+; GFX1032-NEXT: s_mov_b32 s9, s3
+; GFX1032-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: buffer_gl1_inv
+; GFX1032-NEXT: buffer_gl0_inv
+; GFX1032-NEXT: .LBB5_4:
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1032-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
+; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: add_i64_varying:
+; GFX1164: ; %bb.0: ; %entry
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[4:5], 0
+; GFX1164-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_ctz_i32_b64 s6, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1164-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1164-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1164-NEXT: v_writelane_b32 v1, s4, s6
+; GFX1164-NEXT: v_writelane_b32 v2, s5, s6
+; GFX1164-NEXT: s_add_u32 s4, s4, s7
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_addc_u32 s5, s5, s8
+; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1164-NEXT: s_cbranch_execz .LBB5_4
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: v_mov_b32_e32 v3, s4
+; GFX1164-NEXT: v_mov_b32_e32 v4, s5
+; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s10, -1
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_mov_b32 s8, s2
+; GFX1164-NEXT: s_mov_b32 s9, s3
+; GFX1164-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], 0 glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: buffer_gl1_inv
+; GFX1164-NEXT: buffer_gl0_inv
+; GFX1164-NEXT: .LBB5_4:
+; GFX1164-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-NEXT: v_add_co_u32 v0, vcc, s2, v1
+; GFX1164-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: s_nop 0
+; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: add_i64_varying:
+; GFX1132: ; %bb.0: ; %entry
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b64 s[4:5], 0
+; GFX1132-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1132-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1132-NEXT: v_writelane_b32 v2, s5, s3
+; GFX1132-NEXT: s_add_u32 s4, s4, s6
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_addc_u32 s5, s5, s7
+; GFX1132-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s6, exec_lo
+; GFX1132-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s6, exec_lo, s6
+; GFX1132-NEXT: s_cbranch_execz .LBB5_4
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
+; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s10, -1
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_mov_b32 s8, s2
+; GFX1132-NEXT: s_mov_b32 s9, s3
+; GFX1132-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], 0 glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: buffer_gl1_inv
+; GFX1132-NEXT: buffer_gl0_inv
+; GFX1132-NEXT: .LBB5_4:
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1132-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
+; GFX1132-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: s_nop 0
+; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132-NEXT: s_endpgm
+;
+; GFX1264-LABEL: add_i64_varying:
+; GFX1264: ; %bb.0: ; %entry
+; GFX1264-NEXT: v_mov_b32_e32 v3, 0
+; GFX1264-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264-NEXT: s_mov_b64 s[4:5], 0
+; GFX1264-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1264-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1264-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1264-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1264-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1264-NEXT: v_writelane_b32 v2, s5, s10
+; GFX1264-NEXT: v_writelane_b32 v1, s4, s10
+; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1264-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1264-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1264-NEXT: s_mov_b64 s[6:7], exec
+; GFX1264-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1264-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1264-NEXT: s_cbranch_execz .LBB5_4
+; GFX1264-NEXT: ; %bb.3:
+; GFX1264-NEXT: v_mov_b32_e32 v3, s4
+; GFX1264-NEXT: v_mov_b32_e32 v4, s5
+; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s10, -1
+; GFX1264-NEXT: s_wait_kmcnt 0x0
+; GFX1264-NEXT: s_mov_b32 s8, s2
+; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1264-NEXT: s_wait_loadcnt 0x0
+; GFX1264-NEXT: global_inv scope:SCOPE_DEV
+; GFX1264-NEXT: .LBB5_4:
+; GFX1264-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1264-NEXT: s_wait_kmcnt 0x0
+; GFX1264-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1264-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264-NEXT: v_add_co_u32 v0, vcc, s2, v1
+; GFX1264-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s2, -1
+; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1264-NEXT: s_nop 0
+; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1264-NEXT: s_endpgm
+;
+; GFX1232-LABEL: add_i64_varying:
+; GFX1232: ; %bb.0: ; %entry
+; GFX1232-NEXT: v_mov_b32_e32 v3, 0
+; GFX1232-NEXT: s_mov_b32 s2, exec_lo
+; GFX1232-NEXT: s_mov_b64 s[4:5], 0
+; GFX1232-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1232-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1232-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1232-NEXT: s_lshl_b32 s8, 1, s3
+; GFX1232-NEXT: v_writelane_b32 v2, s5, s3
+; GFX1232-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1232-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1232-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX1232-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1232-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1232-NEXT: s_mov_b32 s6, exec_lo
+; GFX1232-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1232-NEXT: s_xor_b32 s6, exec_lo, s6
+; GFX1232-NEXT: s_cbranch_execz .LBB5_4
+; GFX1232-NEXT: ; %bb.3:
+; GFX1232-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
+; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s10, -1
+; GFX1232-NEXT: s_wait_kmcnt 0x0
+; GFX1232-NEXT: s_mov_b32 s8, s2
+; GFX1232-NEXT: s_mov_b32 s9, s3
+; GFX1232-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1232-NEXT: s_wait_loadcnt 0x0
+; GFX1232-NEXT: global_inv scope:SCOPE_DEV
+; GFX1232-NEXT: .LBB5_4:
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1232-NEXT: s_wait_kmcnt 0x0
+; GFX1232-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1232-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
+; GFX1232-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s2, -1
+; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1232-NEXT: s_nop 0
+; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1232-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%zext = zext i32 %lane to i64
@@ -3689,87 +4042,440 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX7LESS-NEXT: s_endpgm
;
-; GFX89-LABEL: sub_i64_varying:
-; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
-; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
-; GFX89-NEXT: v_mov_b32_e32 v1, 0
-; GFX89-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
-; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_wbinvl1_vol
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
-; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX89-NEXT: s_endpgm
+; GFX8-LABEL: sub_i64_varying:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: v_mov_b32_e32 v3, 0
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX8-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s6
+; GFX8-NEXT: v_readlane_b32 s8, v0, s6
+; GFX8-NEXT: v_readlane_b32 s7, v3, s6
+; GFX8-NEXT: v_writelane_b32 v1, s4, m0
+; GFX8-NEXT: s_add_u32 s4, s4, s8
+; GFX8-NEXT: v_writelane_b32 v2, s5, m0
+; GFX8-NEXT: s_addc_u32 s5, s5, s7
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX8-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execz .LBB11_4
+; GFX8-NEXT: ; %bb.3:
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: s_mov_b32 s11, 0xf000
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s8, s2
+; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: .LBB11_4:
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_readfirstlane_b32 s4, v4
+; GFX8-NEXT: v_readfirstlane_b32 s5, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: sub_i64_varying:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX9-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s6
+; GFX9-NEXT: v_readlane_b32 s8, v0, s6
+; GFX9-NEXT: v_readlane_b32 s7, v3, s6
+; GFX9-NEXT: v_writelane_b32 v1, s4, m0
+; GFX9-NEXT: s_add_u32 s4, s4, s8
+; GFX9-NEXT: v_writelane_b32 v2, s5, m0
+; GFX9-NEXT: s_addc_u32 s5, s5, s7
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX9-NEXT: s_cbranch_execz .LBB11_4
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s2
+; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: v_mov_b32_e32 v4, s5
+; GFX9-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: .LBB11_4:
+; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: v_readfirstlane_b32 s4, v4
+; GFX9-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v2, vcc
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: sub_i64_varying:
+; GFX1064: ; %bb.0: ; %entry
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_mov_b64 s[4:5], 0
+; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX1064-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1064-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1064-NEXT: v_writelane_b32 v1, s4, s6
+; GFX1064-NEXT: v_writelane_b32 v2, s5, s6
+; GFX1064-NEXT: s_add_u32 s4, s4, s7
+; GFX1064-NEXT: s_addc_u32 s5, s5, s8
+; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1064-NEXT: s_cbranch_execz .LBB11_4
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: v_mov_b32_e32 v3, s4
+; GFX1064-NEXT: v_mov_b32_e32 v4, s5
+; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s10, -1
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_mov_b32 s8, s2
+; GFX1064-NEXT: s_mov_b32 s9, s3
+; GFX1064-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: buffer_gl1_inv
+; GFX1064-NEXT: buffer_gl0_inv
+; GFX1064-NEXT: .LBB11_4:
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1064-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v1
+; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: sub_i64_varying:
+; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032-NEXT: s_mov_b64 s[4:5], 0
+; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1032-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1032-NEXT: v_writelane_b32 v2, s5, s3
+; GFX1032-NEXT: s_add_u32 s4, s4, s6
+; GFX1032-NEXT: s_addc_u32 s5, s5, s7
+; GFX1032-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1032-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s6, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s6, exec_lo, s6
+; GFX1032-NEXT: s_cbranch_execz .LBB11_4
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: v_mov_b32_e32 v3, s4
+; GFX1032-NEXT: v_mov_b32_e32 v4, s5
+; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s10, -1
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_mov_b32 s8, s2
+; GFX1032-NEXT: s_mov_b32 s9, s3
+; GFX1032-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: buffer_gl1_inv
+; GFX1032-NEXT: buffer_gl0_inv
+; GFX1032-NEXT: .LBB11_4:
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1032-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1
+; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: sub_i64_varying:
+; GFX1164: ; %bb.0: ; %entry
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[4:5], 0
+; GFX1164-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_ctz_i32_b64 s6, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1164-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1164-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1164-NEXT: v_writelane_b32 v1, s4, s6
+; GFX1164-NEXT: v_writelane_b32 v2, s5, s6
+; GFX1164-NEXT: s_add_u32 s4, s4, s7
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_addc_u32 s5, s5, s8
+; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1164-NEXT: s_cbranch_execz .LBB11_4
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: v_mov_b32_e32 v3, s4
+; GFX1164-NEXT: v_mov_b32_e32 v4, s5
+; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s10, -1
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_mov_b32 s8, s2
+; GFX1164-NEXT: s_mov_b32 s9, s3
+; GFX1164-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], 0 glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: buffer_gl1_inv
+; GFX1164-NEXT: buffer_gl0_inv
+; GFX1164-NEXT: .LBB11_4:
+; GFX1164-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v1
+; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: s_nop 0
+; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164-NEXT: s_endpgm
;
-; GFX10-LABEL: sub_i64_varying:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s11, s7
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
-; GFX10-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_mov_b32 s5, s1
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: sub_i64_varying:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-LABEL: sub_i64_varying:
-; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s11, s7
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_mov_b32 s5, s1
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-NEXT: s_endpgm
+; GFX1132-LABEL: sub_i64_varying:
+; GFX1132: ; %bb.0: ; %entry
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b64 s[4:5], 0
+; GFX1132-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1132-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1132-NEXT: v_writelane_b32 v2, s5, s3
+; GFX1132-NEXT: s_add_u32 s4, s4, s6
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_addc_u32 s5, s5, s7
+; GFX1132-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s6, exec_lo
+; GFX1132-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s6, exec_lo, s6
+; GFX1132-NEXT: s_cbranch_execz .LBB11_4
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
+; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s10, -1
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_mov_b32 s8, s2
+; GFX1132-NEXT: s_mov_b32 s9, s3
+; GFX1132-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], 0 glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: buffer_gl1_inv
+; GFX1132-NEXT: buffer_gl0_inv
+; GFX1132-NEXT: .LBB11_4:
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1132-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1
+; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: s_nop 0
+; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132-NEXT: s_endpgm
+;
+; GFX1264-LABEL: sub_i64_varying:
+; GFX1264: ; %bb.0: ; %entry
+; GFX1264-NEXT: v_mov_b32_e32 v3, 0
+; GFX1264-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264-NEXT: s_mov_b64 s[4:5], 0
+; GFX1264-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1264-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1264-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1264-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1264-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1264-NEXT: v_writelane_b32 v2, s5, s10
+; GFX1264-NEXT: v_writelane_b32 v1, s4, s10
+; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1264-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1264-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1264-NEXT: s_mov_b64 s[6:7], exec
+; GFX1264-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1264-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1264-NEXT: s_cbranch_execz .LBB11_4
+; GFX1264-NEXT: ; %bb.3:
+; GFX1264-NEXT: v_mov_b32_e32 v3, s4
+; GFX1264-NEXT: v_mov_b32_e32 v4, s5
+; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s10, -1
+; GFX1264-NEXT: s_wait_kmcnt 0x0
+; GFX1264-NEXT: s_mov_b32 s8, s2
+; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1264-NEXT: s_wait_loadcnt 0x0
+; GFX1264-NEXT: global_inv scope:SCOPE_DEV
+; GFX1264-NEXT: .LBB11_4:
+; GFX1264-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1264-NEXT: s_wait_kmcnt 0x0
+; GFX1264-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1264-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s2, v1
+; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s2, -1
+; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1264-NEXT: s_nop 0
+; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1264-NEXT: s_endpgm
+;
+; GFX1232-LABEL: sub_i64_varying:
+; GFX1232: ; %bb.0: ; %entry
+; GFX1232-NEXT: v_mov_b32_e32 v3, 0
+; GFX1232-NEXT: s_mov_b32 s2, exec_lo
+; GFX1232-NEXT: s_mov_b64 s[4:5], 0
+; GFX1232-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1232-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1232-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1232-NEXT: s_lshl_b32 s8, 1, s3
+; GFX1232-NEXT: v_writelane_b32 v2, s5, s3
+; GFX1232-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1232-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1232-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX1232-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1232-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1232-NEXT: s_mov_b32 s6, exec_lo
+; GFX1232-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1232-NEXT: s_xor_b32 s6, exec_lo, s6
+; GFX1232-NEXT: s_cbranch_execz .LBB11_4
+; GFX1232-NEXT: ; %bb.3:
+; GFX1232-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
+; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s10, -1
+; GFX1232-NEXT: s_wait_kmcnt 0x0
+; GFX1232-NEXT: s_mov_b32 s8, s2
+; GFX1232-NEXT: s_mov_b32 s9, s3
+; GFX1232-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1232-NEXT: s_wait_loadcnt 0x0
+; GFX1232-NEXT: global_inv scope:SCOPE_DEV
+; GFX1232-NEXT: .LBB11_4:
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1232-NEXT: s_wait_kmcnt 0x0
+; GFX1232-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1232-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1
+; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s2, -1
+; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1232-NEXT: s_nop 0
+; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1232-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%zext = zext i32 %lane to i64
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index b0b40aa952a9f..453bd07647c73 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -2,10 +2,10 @@
; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
declare i32 @llvm.amdgcn.workitem.id.x()
@@ -1445,56 +1445,301 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
;
; GFX8-LABEL: add_i64_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: v_mov_b32_e32 v3, 0
+; GFX8-NEXT: s_mov_b64 s[2:3], 0
+; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX8-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX8-NEXT: s_mov_b32 m0, s6
+; GFX8-NEXT: v_readlane_b32 s8, v0, s6
+; GFX8-NEXT: v_readlane_b32 s7, v3, s6
+; GFX8-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8-NEXT: s_add_u32 s2, s2, s8
+; GFX8-NEXT: v_writelane_b32 v2, s3, m0
+; GFX8-NEXT: s_addc_u32 s3, s3, s7
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execz .LBB6_4
+; GFX8-NEXT: ; %bb.3:
+; GFX8-NEXT: v_mov_b32_e32 v4, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
+; GFX8-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: .LBB6_4:
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s4, v4
+; GFX8-NEXT: v_readfirstlane_b32 s5, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s5, v1
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i64_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX9-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX9-NEXT: s_mov_b32 m0, s6
+; GFX9-NEXT: v_readlane_b32 s8, v0, s6
+; GFX9-NEXT: v_readlane_b32 s7, v3, s6
+; GFX9-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9-NEXT: s_add_u32 s2, s2, s8
+; GFX9-NEXT: v_writelane_b32 v2, s3, m0
+; GFX9-NEXT: s_addc_u32 s3, s3, s7
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX9-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB6_4
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: .LBB6_4:
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s4, v4
+; GFX9-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v1
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v2, vcc
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
-; GFX10-LABEL: add_i64_varying:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: add_i64_varying:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX1064-LABEL: add_i64_varying:
+; GFX1064: ; %bb.0: ; %entry
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX1064-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1064-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1064-NEXT: v_writelane_b32 v1, s2, s6
+; GFX1064-NEXT: v_writelane_b32 v2, s3, s6
+; GFX1064-NEXT: s_add_u32 s2, s2, s7
+; GFX1064-NEXT: s_addc_u32 s3, s3, s8
+; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1064-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064-NEXT: s_cbranch_execz .LBB6_4
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: v_mov_b32_e32 v4, s3
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s2
+; GFX1064-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: buffer_gl0_inv
+; GFX1064-NEXT: .LBB6_4:
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1064-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1064-NEXT: v_add_co_u32 v0, vcc, s2, v1
+; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: add_i64_varying:
+; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032-NEXT: s_mov_b64 s[2:3], 0
+; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1032-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1032-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1032-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1032-NEXT: s_add_u32 s2, s2, s6
+; GFX1032-NEXT: s_addc_u32 s3, s3, s7
+; GFX1032-NEXT: s_lshl_b32 s5, 1, s5
+; GFX1032-NEXT: s_andn2_b32 s4, s4, s5
+; GFX1032-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1032-NEXT: s_cbranch_execz .LBB6_4
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: v_mov_b32_e32 v4, s3
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_mov_b32_e32 v3, s2
+; GFX1032-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: buffer_gl0_inv
+; GFX1032-NEXT: .LBB6_4:
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1032-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
+; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: add_i64_varying:
+; GFX1164: ; %bb.0: ; %entry
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1164-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_ctz_i32_b64 s6, s[4:5]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1164-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1164-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1164-NEXT: v_writelane_b32 v1, s2, s6
+; GFX1164-NEXT: v_writelane_b32 v2, s3, s6
+; GFX1164-NEXT: s_add_u32 s2, s2, s7
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_addc_u32 s3, s3, s8
+; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1164-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164-NEXT: s_cbranch_execz .LBB6_4
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: v_mov_b32_e32 v4, s3
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s2
+; GFX1164-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: buffer_gl0_inv
+; GFX1164-NEXT: .LBB6_4:
+; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-NEXT: v_add_co_u32 v0, vcc, s2, v1
+; GFX1164-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: s_nop 0
+; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: add_i64_varying:
+; GFX1132: ; %bb.0: ; %entry
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: s_mov_b64 s[2:3], 0
+; GFX1132-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1132-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1132-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1132-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1132-NEXT: s_add_u32 s2, s2, s6
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_addc_u32 s3, s3, s7
+; GFX1132-NEXT: s_lshl_b32 s5, 1, s5
+; GFX1132-NEXT: s_and_not1_b32 s4, s4, s5
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1132-NEXT: s_cbranch_execz .LBB6_4
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: v_mov_b32_e32 v4, s3
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2
+; GFX1132-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: buffer_gl0_inv
+; GFX1132-NEXT: .LBB6_4:
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1132-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
+; GFX1132-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: s_nop 0
+; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%zext = zext i32 %lane to i64
@@ -2972,56 +3217,301 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
;
; GFX8-LABEL: sub_i64_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: v_mov_b32_e32 v3, 0
+; GFX8-NEXT: s_mov_b64 s[2:3], 0
+; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX8-NEXT: .LBB13_1: ; %ComputeLoop
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX8-NEXT: s_mov_b32 m0, s6
+; GFX8-NEXT: v_readlane_b32 s8, v0, s6
+; GFX8-NEXT: v_readlane_b32 s7, v3, s6
+; GFX8-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8-NEXT: s_add_u32 s2, s2, s8
+; GFX8-NEXT: v_writelane_b32 v2, s3, m0
+; GFX8-NEXT: s_addc_u32 s3, s3, s7
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8-NEXT: s_cbranch_scc1 .LBB13_1
+; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execz .LBB13_4
+; GFX8-NEXT: ; %bb.3:
+; GFX8-NEXT: v_mov_b32_e32 v4, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
+; GFX8-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: .LBB13_4:
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s4, v4
+; GFX8-NEXT: v_readfirstlane_b32 s5, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v1
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i64_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX9-NEXT: .LBB13_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX9-NEXT: s_mov_b32 m0, s6
+; GFX9-NEXT: v_readlane_b32 s8, v0, s6
+; GFX9-NEXT: v_readlane_b32 s7, v3, s6
+; GFX9-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9-NEXT: s_add_u32 s2, s2, s8
+; GFX9-NEXT: v_writelane_b32 v2, s3, m0
+; GFX9-NEXT: s_addc_u32 s3, s3, s7
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX9-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB13_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB13_4
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: .LBB13_4:
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s4, v4
+; GFX9-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v1
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v2, vcc
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
-; GFX10-LABEL: sub_i64_varying:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: sub_i64_varying:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX1064-LABEL: sub_i64_varying:
+; GFX1064: ; %bb.0: ; %entry
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1064-NEXT: .LBB13_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX1064-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1064-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1064-NEXT: v_writelane_b32 v1, s2, s6
+; GFX1064-NEXT: v_writelane_b32 v2, s3, s6
+; GFX1064-NEXT: s_add_u32 s2, s2, s7
+; GFX1064-NEXT: s_addc_u32 s3, s3, s8
+; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1064-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB13_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064-NEXT: s_cbranch_execz .LBB13_4
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: v_mov_b32_e32 v4, s3
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s2
+; GFX1064-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: buffer_gl0_inv
+; GFX1064-NEXT: .LBB13_4:
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1064-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v1
+; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: sub_i64_varying:
+; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032-NEXT: s_mov_b64 s[2:3], 0
+; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1032-NEXT: .LBB13_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1032-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1032-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1032-NEXT: s_add_u32 s2, s2, s6
+; GFX1032-NEXT: s_addc_u32 s3, s3, s7
+; GFX1032-NEXT: s_lshl_b32 s5, 1, s5
+; GFX1032-NEXT: s_andn2_b32 s4, s4, s5
+; GFX1032-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB13_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1032-NEXT: s_cbranch_execz .LBB13_4
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: v_mov_b32_e32 v4, s3
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_mov_b32_e32 v3, s2
+; GFX1032-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: buffer_gl0_inv
+; GFX1032-NEXT: .LBB13_4:
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1032-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1
+; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: sub_i64_varying:
+; GFX1164: ; %bb.0: ; %entry
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1164-NEXT: .LBB13_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_ctz_i32_b64 s6, s[4:5]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1164-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1164-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1164-NEXT: v_writelane_b32 v1, s2, s6
+; GFX1164-NEXT: v_writelane_b32 v2, s3, s6
+; GFX1164-NEXT: s_add_u32 s2, s2, s7
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_addc_u32 s3, s3, s8
+; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1164-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB13_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164-NEXT: s_cbranch_execz .LBB13_4
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: v_mov_b32_e32 v4, s3
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s2
+; GFX1164-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: buffer_gl0_inv
+; GFX1164-NEXT: .LBB13_4:
+; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v1
+; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: s_nop 0
+; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: sub_i64_varying:
+; GFX1132: ; %bb.0: ; %entry
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: s_mov_b64 s[2:3], 0
+; GFX1132-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1132-NEXT: .LBB13_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1132-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1132-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1132-NEXT: s_add_u32 s2, s2, s6
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_addc_u32 s3, s3, s7
+; GFX1132-NEXT: s_lshl_b32 s5, 1, s5
+; GFX1132-NEXT: s_and_not1_b32 s4, s4, s5
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB13_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1132-NEXT: s_cbranch_execz .LBB13_4
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: v_mov_b32_e32 v4, s3
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2
+; GFX1132-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: buffer_gl0_inv
+; GFX1132-NEXT: .LBB13_4:
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1132-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1
+; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: s_nop 0
+; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%zext = zext i32 %lane to i64
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
index 9d8b987d2ba68..60149b90cb048 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
@@ -1,255 +1,1199 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefixes=GFX90A,GFX90A_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefixes=GFX90A,GFX90A_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefixes=GFX940,GFX940_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefixes=GFX940,GFX940_DPP %s
define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic
+ ; GFX940: bb.0 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret void
}
define amdgpu_ps double @global_atomic_fadd_f64_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]]
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX90A-NEXT: $sgpr0 = COPY [[COPY6]]
+ ; GFX90A-NEXT: $sgpr1 = COPY [[COPY7]]
+ ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic
+ ; GFX940: bb.0 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX940-NEXT: $sgpr0 = COPY [[COPY6]]
+ ; GFX940-NEXT: $sgpr1 = COPY [[COPY7]]
+ ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret double %ret
}
define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic
+ ; GFX940: bb.0 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret void
}
define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]]
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
+ ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
+ ; GFX90A-NEXT: $sgpr0 = COPY [[COPY5]]
+ ; GFX90A-NEXT: $sgpr1 = COPY [[COPY6]]
+ ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic
+ ; GFX940: bb.0 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
+ ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
+ ; GFX940-NEXT: $sgpr0 = COPY [[COPY5]]
+ ; GFX940-NEXT: $sgpr1 = COPY [[COPY6]]
+ ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret double %ret
}
define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic
+ ; GFX940: bb.0 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret void
}
define amdgpu_ps double @global_atomic_fadd_f64_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]]
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX90A-NEXT: $sgpr0 = COPY [[COPY6]]
+ ; GFX90A-NEXT: $sgpr1 = COPY [[COPY7]]
+ ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic
+ ; GFX940: bb.0 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX940-NEXT: $sgpr0 = COPY [[COPY6]]
+ ; GFX940-NEXT: $sgpr1 = COPY [[COPY7]]
+ ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret double %ret
}
define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic
+ ; GFX940: bb.0 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret void
}
; Returning f64 fadd issued through @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64.
; The addrspace(1) pointer is passed inreg (it arrives in $sgpr0/$sgpr1 per the
; liveins below) and the data operand arrives in $vgpr0/$vgpr1. The value
; produced by the intrinsic is returned from the function.
define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]]
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
+ ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
+ ; GFX90A-NEXT: $sgpr0 = COPY [[COPY5]]
+ ; GFX90A-NEXT: $sgpr1 = COPY [[COPY6]]
+ ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic
+ ; GFX940: bb.0 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
+ ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
+ ; GFX940-NEXT: $sgpr0 = COPY [[COPY5]]
+ ; GFX940-NEXT: $sgpr1 = COPY [[COPY6]]
+ ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
; The flat-named intrinsic is called on an addrspace(1) pointer; the expected
; output above contains a GLOBAL_ATOMIC_ADD_F64_SADDR_RTN for this call.
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret double %ret
}
; Non-returning f64 fadd written as an atomicrmw on an addrspace(1) pointer.
; Neither argument is marked inreg; the liveins below list $vgpr0-$vgpr3 only.
; NOTE(review): attribute group #0 is defined outside this excerpt.
define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) #0 {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw
+ ; GFX940: bb.0 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: S_ENDPGM 0
; %ret is never used (the function returns void); the expected output above
; contains GLOBAL_ATOMIC_ADD_F64 with no result register.
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic
ret void
}
; Returning f64 fadd written as an atomicrmw on an addrspace(1) pointer.
; Neither argument is marked inreg; the liveins below list $vgpr0-$vgpr3 only.
; NOTE(review): attribute group #0 is defined outside this excerpt.
define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) #0 {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]]
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX90A-NEXT: $sgpr0 = COPY [[COPY6]]
+ ; GFX90A-NEXT: $sgpr1 = COPY [[COPY7]]
+ ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw
+ ; GFX940: bb.0 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX940-NEXT: $sgpr0 = COPY [[COPY6]]
+ ; GFX940-NEXT: $sgpr1 = COPY [[COPY7]]
+ ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
; %ret is returned, so the expected output above contains the _RTN form of the
; instruction and copies its sub0/sub1 halves into $sgpr0/$sgpr1.
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic
ret double %ret
}
define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) #0 {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw
+ ; GFX90A_ITERATIVE: bb.0 (%ir-block.0):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX90A_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX90A_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
+ ; GFX90A_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.1
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.1 (%ir-block.5):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.6(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX90A_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[COPY6]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.6
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.2 (%ir-block.7):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.3(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY %42
+ ; GFX90A_ITERATIVE-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], [[COPY8]], [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.3.Flow:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.5(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: SI_END_CF %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.5
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.4 (%ir-block.9):
+ ; GFX90A_ITERATIVE-NEXT: S_ENDPGM 0
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.5.Flow1:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.4(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.4
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.6.ComputeLoop:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.7(0x04000000), %bb.6(0x7c000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[V_MOV_B]], %bb.1, %26, %bb.6
+ ; GFX90A_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:sreg_64 = PHI [[COPY7]], %bb.1, %5, %bb.6
+ ; GFX90A_ITERATIVE-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sreg_32 = S_FF1_I32_B64 [[PHI1]]
+ ; GFX90A_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY9]], [[S_FF1_I32_B64_]]
+ ; GFX90A_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY10]], [[S_FF1_I32_B64_]]
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY killed [[REG_SEQUENCE2]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+ ; GFX90A_ITERATIVE-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[S_MOV_B64_]], [[S_FF1_I32_B64_]], implicit-def dead $scc
+ ; GFX90A_ITERATIVE-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI1]], killed [[S_LSHL_B64_]], implicit-def dead $scc
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX90A_ITERATIVE-NEXT: S_CMP_LG_U64 [[S_ANDN2_B64_]], killed [[S_MOV_B64_1]], implicit-def $scc
+ ; GFX90A_ITERATIVE-NEXT: S_CBRANCH_SCC1 %bb.6, implicit $scc
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.7
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.7.ComputeEnd:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.6
+ ; GFX90A_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY13]], [[COPY14]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY12]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.2
+ ;
+ ; GFX90A_DPP-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw
+ ; GFX90A_DPP: bb.0 (%ir-block.0):
+ ; GFX90A_DPP-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000)
+ ; GFX90A_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A_DPP-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A_DPP-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX90A_DPP-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX90A_DPP-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_DPP-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
+ ; GFX90A_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.1
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.1 (%ir-block.5):
+ ; GFX90A_DPP-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX90A_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub1
+ ; GFX90A_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub0
+ ; GFX90A_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX90A_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY8]], [[COPY9]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY7]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[COPY4]], [[V_MOV_B]], implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, killed [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, killed [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, killed [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, killed [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, killed [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, killed [[V_MOV_B6]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1
+ ; GFX90A_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX90A_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY10]], [[S_MOV_B32_1]]
+ ; GFX90A_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0
+ ; GFX90A_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY11]], [[S_MOV_B32_1]]
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: early-clobber %1:sreg_64 = STRICT_WWM killed [[REG_SEQUENCE2]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.2
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.2 (%ir-block.31):
+ ; GFX90A_DPP-NEXT: successors: %bb.3(0x80000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY %1
+ ; GFX90A_DPP-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], [[COPY12]], [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.3.Flow:
+ ; GFX90A_DPP-NEXT: successors: %bb.4(0x80000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.4 (%ir-block.33):
+ ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw
+ ; GFX940_ITERATIVE: bb.0 (%ir-block.0):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX940_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX940_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX940_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX940_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX940_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
+ ; GFX940_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.1
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.1 (%ir-block.5):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.6(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX940_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[COPY6]]
+ ; GFX940_ITERATIVE-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.6
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.2 (%ir-block.7):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.3(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY %41
+ ; GFX940_ITERATIVE-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], [[COPY8]], [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.3.Flow:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.5(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: SI_END_CF %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.5
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.4 (%ir-block.9):
+ ; GFX940_ITERATIVE-NEXT: S_ENDPGM 0
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.5.Flow1:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.4(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.4
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.6.ComputeLoop:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.7(0x04000000), %bb.6(0x7c000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[V_MOV_B]], %bb.1, %25, %bb.6
+ ; GFX940_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:sreg_64 = PHI [[COPY7]], %bb.1, %5, %bb.6
+ ; GFX940_ITERATIVE-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sreg_32 = S_FF1_I32_B64 [[PHI1]]
+ ; GFX940_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY9]], [[S_FF1_I32_B64_]]
+ ; GFX940_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY10]], [[S_FF1_I32_B64_]]
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY killed [[REG_SEQUENCE2]]
+ ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+ ; GFX940_ITERATIVE-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[S_MOV_B64_]], [[S_FF1_I32_B64_]], implicit-def dead $scc
+ ; GFX940_ITERATIVE-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI1]], killed [[S_LSHL_B64_]], implicit-def dead $scc
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX940_ITERATIVE-NEXT: S_CMP_LG_U64 [[S_ANDN2_B64_]], killed [[S_MOV_B64_1]], implicit-def $scc
+ ; GFX940_ITERATIVE-NEXT: S_CBRANCH_SCC1 %bb.6, implicit $scc
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.7
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.7.ComputeEnd:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.6
+ ; GFX940_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY13]], [[COPY14]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY12]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.2
+ ;
+ ; GFX940_DPP-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw
+ ; GFX940_DPP: bb.0 (%ir-block.0):
+ ; GFX940_DPP-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000)
+ ; GFX940_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940_DPP-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940_DPP-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX940_DPP-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX940_DPP-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940_DPP-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX940_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX940_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
+ ; GFX940_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.1
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.1 (%ir-block.5):
+ ; GFX940_DPP-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX940_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub1
+ ; GFX940_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub0
+ ; GFX940_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX940_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY8]], [[COPY9]], implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY7]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[COPY4]], [[V_MOV_B]], implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, killed [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, killed [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, killed [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, killed [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, killed [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, killed [[V_MOV_B6]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1
+ ; GFX940_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX940_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY10]], [[S_MOV_B32_1]]
+ ; GFX940_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0
+ ; GFX940_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY11]], [[S_MOV_B32_1]]
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1
+ ; GFX940_DPP-NEXT: early-clobber %1:sreg_64 = STRICT_WWM killed [[REG_SEQUENCE2]], implicit $exec
+ ; GFX940_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
+ ; GFX940_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.2
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.2 (%ir-block.31):
+ ; GFX940_DPP-NEXT: successors: %bb.3(0x80000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY %1
+ ; GFX940_DPP-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], [[COPY12]], [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.3.Flow:
+ ; GFX940_DPP-NEXT: successors: %bb.4(0x80000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.4 (%ir-block.33):
+ ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_ENDPGM 0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic
ret void
}
define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) #0 {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]]
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ; GFX90A_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw
+ ; GFX90A_ITERATIVE: bb.0 (%ir-block.0):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX90A_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX90A_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
+ ; GFX90A_ITERATIVE-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[DEF2]]
+ ; GFX90A_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.1
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.1 (%ir-block.5):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.6(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX90A_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[COPY7]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[DEF3]]
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.6
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.2 (%ir-block.7):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.3(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY %68
+ ; GFX90A_ITERATIVE-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY10]], [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.3 (%ir-block.9):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.5(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %77, %bb.7, [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.2
+ ; GFX90A_ITERATIVE-NEXT: SI_END_CF %14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, killed [[REG_SEQUENCE2]], 0, %12, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.5
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.4 (%ir-block.13):
+ ; GFX90A_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY %5.sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY %5.sub1
+ ; GFX90A_ITERATIVE-NEXT: $sgpr0 = COPY [[COPY13]]
+ ; GFX90A_ITERATIVE-NEXT: $sgpr1 = COPY [[COPY14]]
+ ; GFX90A_ITERATIVE-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.5.Flow:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.4(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY6]], %bb.0, [[V_ADD_F64_e64_]], %bb.3
+ ; GFX90A_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.4
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.6.ComputeLoop:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.7(0x04000000), %bb.6(0x7c000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_MOV_B]], %bb.1, %42, %bb.6
+ ; GFX90A_ITERATIVE-NEXT: [[PHI3:%[0-9]+]]:vreg_64_align2 = PHI [[COPY9]], %bb.1, %9, %bb.6
+ ; GFX90A_ITERATIVE-NEXT: [[PHI4:%[0-9]+]]:sreg_64 = PHI [[COPY8]], %bb.1, %11, %bb.6
+ ; GFX90A_ITERATIVE-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sreg_32 = S_FF1_I32_B64 [[PHI4]]
+ ; GFX90A_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY15]], [[S_FF1_I32_B64_]]
+ ; GFX90A_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY16]], [[S_FF1_I32_B64_]]
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1
+ ; GFX90A_ITERATIVE-NEXT: $m0 = COPY [[S_FF1_I32_B64_]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY18]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_2]], $m0, [[COPY17]]
+ ; GFX90A_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0
+ ; GFX90A_ITERATIVE-NEXT: $m0 = COPY [[S_FF1_I32_B64_]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY20]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_3]], $m0, [[COPY19]]
+ ; GFX90A_ITERATIVE-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_1]], %subreg.sub0, [[V_WRITELANE_B32_]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE4]]
+ ; GFX90A_ITERATIVE-NEXT: [[COPY22:%[0-9]+]]:sreg_64 = COPY killed [[REG_SEQUENCE3]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY22]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+ ; GFX90A_ITERATIVE-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[S_MOV_B64_]], [[S_FF1_I32_B64_]], implicit-def dead $scc
+ ; GFX90A_ITERATIVE-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI4]], killed [[S_LSHL_B64_]], implicit-def dead $scc
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX90A_ITERATIVE-NEXT: S_CMP_LG_U64 [[S_ANDN2_B64_]], killed [[S_MOV_B64_1]], implicit-def $scc
+ ; GFX90A_ITERATIVE-NEXT: S_CBRANCH_SCC1 %bb.6, implicit $scc
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.7
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.7.ComputeEnd:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI5:%[0-9]+]]:vreg_64_align2 = PHI [[COPY21]], %bb.6
+ ; GFX90A_ITERATIVE-NEXT: [[PHI6:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_1]], %bb.6
+ ; GFX90A_ITERATIVE-NEXT: [[COPY23:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY24:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY24]], [[COPY25]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY23]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[DEF7]]
+ ; GFX90A_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.2
+ ;
+ ; GFX90A_DPP-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw
+ ; GFX90A_DPP: bb.0 (%ir-block.0):
+ ; GFX90A_DPP-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
+ ; GFX90A_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A_DPP-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A_DPP-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX90A_DPP-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX90A_DPP-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_DPP-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
+ ; GFX90A_DPP-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX90A_DPP-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[DEF2]]
+ ; GFX90A_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.1
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.1 (%ir-block.5):
+ ; GFX90A_DPP-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX90A_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1
+ ; GFX90A_DPP-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0
+ ; GFX90A_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX90A_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY9]], [[COPY10]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY8]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[COPY4]], [[V_MOV_B]], implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, killed [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, killed [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, killed [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, killed [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, killed [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, killed [[V_MOV_B6]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_5]], 312, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1
+ ; GFX90A_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX90A_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY11]], [[S_MOV_B32_1]]
+ ; GFX90A_DPP-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0
+ ; GFX90A_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY12]], [[S_MOV_B32_1]]
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: early-clobber %2:sreg_64 = STRICT_WWM killed [[REG_SEQUENCE2]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX90A_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[DEF3]]
+ ; GFX90A_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.2
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.2 (%ir-block.32):
+ ; GFX90A_DPP-NEXT: successors: %bb.4(0x80000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY %2
+ ; GFX90A_DPP-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY14]], [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.4
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.3.Flow:
+ ; GFX90A_DPP-NEXT: successors: %bb.5(0x80000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[COPY6]], %bb.0, %7, %bb.4
+ ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.5
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.4 (%ir-block.35):
+ ; GFX90A_DPP-NEXT: successors: %bb.3(0x80000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY13]], %bb.1, [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.2
+ ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1
+ ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0
+ ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY16]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: early-clobber %55:vreg_64_align2 = STRICT_WWM [[V_MOV_B7]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_6:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, killed [[REG_SEQUENCE3]], 0, killed %55, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.3
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.5 (%ir-block.40):
+ ; GFX90A_DPP-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0
+ ; GFX90A_DPP-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1
+ ; GFX90A_DPP-NEXT: $sgpr0 = COPY [[COPY17]]
+ ; GFX90A_DPP-NEXT: $sgpr1 = COPY [[COPY18]]
+ ; GFX90A_DPP-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ;
+ ; GFX940_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw
+ ; GFX940_ITERATIVE: bb.0 (%ir-block.0):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX940_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX940_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX940_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX940_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX940_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
+ ; GFX940_ITERATIVE-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[DEF2]]
+ ; GFX940_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.1
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.1 (%ir-block.5):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.6(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX940_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[COPY7]]
+ ; GFX940_ITERATIVE-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[DEF3]]
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.6
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.2 (%ir-block.7):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.3(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY %67
+ ; GFX940_ITERATIVE-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY10]], [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.3 (%ir-block.9):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.5(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %76, %bb.7, [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.2
+ ; GFX940_ITERATIVE-NEXT: SI_END_CF %14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, killed [[REG_SEQUENCE2]], 0, %12, 0, 0, implicit $mode, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.5
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.4 (%ir-block.13):
+ ; GFX940_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY %5.sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY %5.sub1
+ ; GFX940_ITERATIVE-NEXT: $sgpr0 = COPY [[COPY13]]
+ ; GFX940_ITERATIVE-NEXT: $sgpr1 = COPY [[COPY14]]
+ ; GFX940_ITERATIVE-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.5.Flow:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.4(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY6]], %bb.0, [[V_ADD_F64_e64_]], %bb.3
+ ; GFX940_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.4
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.6.ComputeLoop:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.7(0x04000000), %bb.6(0x7c000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_MOV_B]], %bb.1, %41, %bb.6
+ ; GFX940_ITERATIVE-NEXT: [[PHI3:%[0-9]+]]:vreg_64_align2 = PHI [[COPY9]], %bb.1, %9, %bb.6
+ ; GFX940_ITERATIVE-NEXT: [[PHI4:%[0-9]+]]:sreg_64 = PHI [[COPY8]], %bb.1, %11, %bb.6
+ ; GFX940_ITERATIVE-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sreg_32 = S_FF1_I32_B64 [[PHI4]]
+ ; GFX940_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY15]], [[S_FF1_I32_B64_]]
+ ; GFX940_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY16]], [[S_FF1_I32_B64_]]
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1
+ ; GFX940_ITERATIVE-NEXT: $m0 = COPY [[S_FF1_I32_B64_]]
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY18]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_2]], $m0, [[COPY17]]
+ ; GFX940_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0
+ ; GFX940_ITERATIVE-NEXT: $m0 = COPY [[S_FF1_I32_B64_]]
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY20]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_3]], $m0, [[COPY19]]
+ ; GFX940_ITERATIVE-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_1]], %subreg.sub0, [[V_WRITELANE_B32_]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE4]]
+ ; GFX940_ITERATIVE-NEXT: [[COPY22:%[0-9]+]]:sreg_64 = COPY killed [[REG_SEQUENCE3]]
+ ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY22]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+ ; GFX940_ITERATIVE-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[S_MOV_B64_]], [[S_FF1_I32_B64_]], implicit-def dead $scc
+ ; GFX940_ITERATIVE-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI4]], killed [[S_LSHL_B64_]], implicit-def dead $scc
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX940_ITERATIVE-NEXT: S_CMP_LG_U64 [[S_ANDN2_B64_]], killed [[S_MOV_B64_1]], implicit-def $scc
+ ; GFX940_ITERATIVE-NEXT: S_CBRANCH_SCC1 %bb.6, implicit $scc
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.7
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.7.ComputeEnd:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI5:%[0-9]+]]:vreg_64_align2 = PHI [[COPY21]], %bb.6
+ ; GFX940_ITERATIVE-NEXT: [[PHI6:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_1]], %bb.6
+ ; GFX940_ITERATIVE-NEXT: [[COPY23:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY24:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_ITERATIVE-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY24]], [[COPY25]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY23]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[DEF7]]
+ ; GFX940_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.2
+ ;
+ ; GFX940_DPP-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw
+ ; GFX940_DPP: bb.0 (%ir-block.0):
+ ; GFX940_DPP-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
+ ; GFX940_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940_DPP-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940_DPP-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX940_DPP-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX940_DPP-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940_DPP-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX940_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX940_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
+ ; GFX940_DPP-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX940_DPP-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[DEF2]]
+ ; GFX940_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.1
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.1 (%ir-block.5):
+ ; GFX940_DPP-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX940_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1
+ ; GFX940_DPP-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0
+ ; GFX940_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX940_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY9]], [[COPY10]], implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY8]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[COPY4]], [[V_MOV_B]], implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, killed [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, killed [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, killed [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, killed [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, killed [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, killed [[V_MOV_B6]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_5]], 312, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1
+ ; GFX940_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX940_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY11]], [[S_MOV_B32_1]]
+ ; GFX940_DPP-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0
+ ; GFX940_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY12]], [[S_MOV_B32_1]]
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1
+ ; GFX940_DPP-NEXT: early-clobber %2:sreg_64 = STRICT_WWM killed [[REG_SEQUENCE2]], implicit $exec
+ ; GFX940_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
+ ; GFX940_DPP-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX940_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[DEF3]]
+ ; GFX940_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.2
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.2 (%ir-block.32):
+ ; GFX940_DPP-NEXT: successors: %bb.4(0x80000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY %2
+ ; GFX940_DPP-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY14]], [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.4
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.3.Flow:
+ ; GFX940_DPP-NEXT: successors: %bb.5(0x80000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[COPY6]], %bb.0, %7, %bb.4
+ ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.5
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.4 (%ir-block.35):
+ ; GFX940_DPP-NEXT: successors: %bb.3(0x80000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY13]], %bb.1, [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.2
+ ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1
+ ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0
+ ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY16]], implicit $exec
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; GFX940_DPP-NEXT: early-clobber %54:vreg_64_align2 = STRICT_WWM [[V_MOV_B7]], implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_6:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, killed [[REG_SEQUENCE3]], 0, killed %54, 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.3
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.5 (%ir-block.40):
+ ; GFX940_DPP-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0
+ ; GFX940_DPP-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1
+ ; GFX940_DPP-NEXT: $sgpr0 = COPY [[COPY17]]
+ ; GFX940_DPP-NEXT: $sgpr1 = COPY [[COPY18]]
+ ; GFX940_DPP-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic
ret double %ret
}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
index 2397d6c4e8938..166865b9b866f 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
@@ -990,9 +990,87 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_agent_s
}
define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 {
-; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-NEXT: ret double [[RESULT]]
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP22:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]])
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = fadd double [[TMP14]], [[TMP21:%.*]]
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ]
+; IR-ITERATIVE-NEXT: ret double [[TMP17]]
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP19]])
+; IR-ITERATIVE-NEXT: [[TMP21]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP19]], double [[OLDVALUEPHI]])
+; IR-ITERATIVE-NEXT: [[TMP22]] = fadd double [[ACCUMULATOR]], [[TMP20]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]]
+; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1
+; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]]
+; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00)
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP11:%.*]] = fadd double [[TMP9]], [[TMP10]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP13:%.*]] = fadd double [[TMP11]], [[TMP12]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP15:%.*]] = fadd double [[TMP13]], [[TMP14]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP17:%.*]] = fadd double [[TMP15]], [[TMP16]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP19:%.*]] = fadd double [[TMP17]], [[TMP18]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP21:%.*]] = fadd double [[TMP19]], [[TMP20]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP21]], i32 312, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63)
+; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]])
+; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 26:
+; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ]
+; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]])
+; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]])
+; IR-DPP-NEXT: [[TMP32:%.*]] = fadd double [[TMP30]], [[TMP31]]
+; IR-DPP-NEXT: br label [[TMP33]]
+; IR-DPP: 33:
+; IR-DPP-NEXT: [[TMP34:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ]
+; IR-DPP-NEXT: ret double [[TMP34]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4
ret double %result
@@ -1064,9 +1142,87 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_
}
define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1 {
-; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8
-; IR-NEXT: ret double [[RESULT]]
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP22:%.*]] syncscope("one-as") monotonic, align 8
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP14]], double [[TMP21:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ]
+; IR-ITERATIVE-NEXT: ret double [[TMP17]]
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP21]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP19]], double [[OLDVALUEPHI]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]]
+; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1
+; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]]
+; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 26:
+; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("one-as") monotonic, align 8
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ]
+; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP30]], double [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: br label [[TMP33]]
+; IR-DPP: 33:
+; IR-DPP-NEXT: [[TMP34:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ]
+; IR-DPP-NEXT: ret double [[TMP34]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic
ret double %result
@@ -1138,9 +1294,87 @@ define amdgpu_ps double @global_atomic_fsub_double_uni_address_uni_value_agent_s
}
define amdgpu_ps double @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 {
-; IR-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
-; IR-NEXT: ret double [[RESULT]]
+; IR-ITERATIVE-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[TMP22:%.*]] syncscope("agent") monotonic, align 8
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP14]], double [[TMP21:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ]
+; IR-ITERATIVE-NEXT: ret double [[TMP17]]
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP21]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP19]], double [[OLDVALUEPHI]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]]
+; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1
+; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]]
+; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 26:
+; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("agent") monotonic, align 8
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ]
+; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP30]], double [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: br label [[TMP33]]
+; IR-DPP: 33:
+; IR-DPP-NEXT: [[TMP34:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ]
+; IR-DPP-NEXT: ret double [[TMP34]]
;
%result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
ret double %result
@@ -1178,9 +1412,87 @@ define amdgpu_ps double @global_atomic_fmin_double_uni_address_uni_value_agent_s
}
define amdgpu_ps double @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 {
-; IR-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
-; IR-NEXT: ret double [[RESULT]]
+; IR-ITERATIVE-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[TMP22:%.*]] syncscope("agent") monotonic, align 8
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]])
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.minnum.f64(double [[TMP14]], double [[TMP21:%.*]])
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ]
+; IR-ITERATIVE-NEXT: ret double [[TMP17]]
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ 0x7FF0000000000000, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP19]])
+; IR-ITERATIVE-NEXT: [[TMP21]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP19]], double [[OLDVALUEPHI]])
+; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.minnum.f64(double [[ACCUMULATOR]], double [[TMP20]])
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]]
+; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1
+; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]]
+; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double 0x7FF0000000000000)
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP9]], i32 273, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.minnum.f64(double [[TMP9]], double [[TMP10]])
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP11]], i32 274, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.minnum.f64(double [[TMP11]], double [[TMP12]])
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP13]], i32 276, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.minnum.f64(double [[TMP13]], double [[TMP14]])
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP15]], i32 280, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.minnum.f64(double [[TMP15]], double [[TMP16]])
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP17]], i32 322, i32 10, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.minnum.f64(double [[TMP17]], double [[TMP18]])
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP19]], i32 323, i32 12, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.minnum.f64(double [[TMP19]], double [[TMP20]])
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP21]], i32 312, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63)
+; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]])
+; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 26:
+; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("agent") monotonic, align 8
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ]
+; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]])
+; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]])
+; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.minnum.f64(double [[TMP30]], double [[TMP31]])
+; IR-DPP-NEXT: br label [[TMP33]]
+; IR-DPP: 33:
+; IR-DPP-NEXT: [[TMP34:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ]
+; IR-DPP-NEXT: ret double [[TMP34]]
;
%result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
ret double %result
@@ -1244,9 +1556,87 @@ define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_
}
define amdgpu_ps double @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1{
-; IR-LABEL: @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
-; IR-NEXT: ret double [[RESULT]]
+; IR-ITERATIVE-LABEL: @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[TMP22:%.*]] syncscope("agent") monotonic, align 8
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP14]], double [[TMP21:%.*]], metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ]
+; IR-ITERATIVE-NEXT: ret double [[TMP17]]
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ 0xFFF0000000000000, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP21]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP19]], double [[OLDVALUEPHI]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.experimental.constrained.maxnum.f64(double [[ACCUMULATOR]], double [[TMP20]], metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]]
+; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1
+; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]]
+; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double 0xFFF0000000000000) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP9]], double [[TMP10]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP11]], double [[TMP12]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP13]], double [[TMP14]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP15]], double [[TMP16]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP17]], double [[TMP18]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP19]], double [[TMP20]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 26:
+; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("agent") monotonic, align 8
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ]
+; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP30]], double [[TMP31]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: br label [[TMP33]]
+; IR-DPP: 33:
+; IR-DPP-NEXT: [[TMP34:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ]
+; IR-DPP-NEXT: ret double [[TMP34]]
;
%result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
ret double %result
@@ -1318,9 +1708,87 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_system_
}
define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 {
-; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4
-; IR-NEXT: ret double [[RESULT]]
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP22:%.*]] monotonic, align 4
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP14]], double [[TMP21:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ]
+; IR-ITERATIVE-NEXT: ret double [[TMP17]]
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP21]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP19]], double [[OLDVALUEPHI]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]]
+; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1
+; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]]
+; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 26:
+; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] monotonic, align 4
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ]
+; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP30]], double [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: br label [[TMP33]]
+; IR-DPP: 33:
+; IR-DPP-NEXT: [[TMP34:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ]
+; IR-DPP-NEXT: ret double [[TMP34]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4
ret double %result
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
index d6edba001fb13..8da8a9e9d3c61 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
@@ -852,9 +852,75 @@ define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_agent_sco
}
define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 {
-; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-NEXT: ret void
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: br label [[TMP13]]
+; IR-ITERATIVE: 13:
+; IR-ITERATIVE-NEXT: ret void
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]])
+; IR-ITERATIVE-NEXT: [[TMP17]] = fadd double [[ACCUMULATOR]], [[TMP16]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1
+; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00)
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP11:%.*]] = fadd double [[TMP9]], [[TMP10]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP13:%.*]] = fadd double [[TMP11]], [[TMP12]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP15:%.*]] = fadd double [[TMP13]], [[TMP14]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP17:%.*]] = fadd double [[TMP15]], [[TMP16]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP19:%.*]] = fadd double [[TMP17]], [[TMP18]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP21:%.*]] = fadd double [[TMP19]], [[TMP20]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63)
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]])
+; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]]
+; IR-DPP: 25:
+; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT: br label [[TMP27]]
+; IR-DPP: 27:
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: ret void
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4
ret void
@@ -914,9 +980,75 @@ define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_one_as_sc
}
define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1 {
-; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8
-; IR-NEXT: ret void
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("one-as") monotonic, align 8
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: br label [[TMP13]]
+; IR-ITERATIVE: 13:
+; IR-ITERATIVE-NEXT: ret void
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1
+; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]]
+; IR-DPP: 25:
+; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("one-as") monotonic, align 8
+; IR-DPP-NEXT: br label [[TMP27]]
+; IR-DPP: 27:
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: ret void
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic
ret void
@@ -976,9 +1108,75 @@ define amdgpu_ps void @global_atomic_fsub_double_uni_address_uni_value_agent_sco
}
define amdgpu_ps void @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 {
-; IR-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
-; IR-NEXT: ret void
+; IR-ITERATIVE-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("agent") monotonic, align 8
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: br label [[TMP13]]
+; IR-ITERATIVE: 13:
+; IR-ITERATIVE-NEXT: ret void
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1
+; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]]
+; IR-DPP: 25:
+; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("agent") monotonic, align 8
+; IR-DPP-NEXT: br label [[TMP27]]
+; IR-DPP: 27:
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: ret void
;
%result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
ret void
@@ -1010,9 +1208,75 @@ define amdgpu_ps void @global_atomic_fmin_double_uni_address_uni_value_agent_sco
}
define amdgpu_ps void @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 {
-; IR-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
-; IR-NEXT: ret void
+; IR-ITERATIVE-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("agent") monotonic, align 8
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: br label [[TMP13]]
+; IR-ITERATIVE: 13:
+; IR-ITERATIVE-NEXT: ret void
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ 0x7FF0000000000000, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]])
+; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.minnum.f64(double [[ACCUMULATOR]], double [[TMP16]])
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1
+; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double 0x7FF0000000000000)
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP9]], i32 273, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.minnum.f64(double [[TMP9]], double [[TMP10]])
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP11]], i32 274, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.minnum.f64(double [[TMP11]], double [[TMP12]])
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP13]], i32 276, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.minnum.f64(double [[TMP13]], double [[TMP14]])
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP15]], i32 280, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.minnum.f64(double [[TMP15]], double [[TMP16]])
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP17]], i32 322, i32 10, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.minnum.f64(double [[TMP17]], double [[TMP18]])
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP19]], i32 323, i32 12, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.minnum.f64(double [[TMP19]], double [[TMP20]])
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63)
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]])
+; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]]
+; IR-DPP: 25:
+; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("agent") monotonic, align 8
+; IR-DPP-NEXT: br label [[TMP27]]
+; IR-DPP: 27:
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: ret void
;
%result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
ret void
@@ -1064,9 +1328,75 @@ define amdgpu_ps void @global_atomic_fmax_double_uni_address_uni_value_agent_sco
}
define amdgpu_ps void @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1{
-; IR-LABEL: @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
-; IR-NEXT: ret void
+; IR-ITERATIVE-LABEL: @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("agent") monotonic, align 8
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: br label [[TMP13]]
+; IR-ITERATIVE: 13:
+; IR-ITERATIVE-NEXT: ret void
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ 0xFFF0000000000000, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.experimental.constrained.maxnum.f64(double [[ACCUMULATOR]], double [[TMP16]], metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1
+; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double 0xFFF0000000000000) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP9]], double [[TMP10]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP11]], double [[TMP12]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP13]], double [[TMP14]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP15]], double [[TMP16]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP17]], double [[TMP18]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP19]], double [[TMP20]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]]
+; IR-DPP: 25:
+; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("agent") monotonic, align 8
+; IR-DPP-NEXT: br label [[TMP27]]
+; IR-DPP: 27:
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: ret void
;
%result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
ret void
@@ -1126,9 +1456,75 @@ define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_system_sc
}
define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 {
-; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4
-; IR-NEXT: ret void
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] monotonic, align 4
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: br label [[TMP13]]
+; IR-ITERATIVE: 13:
+; IR-ITERATIVE-NEXT: ret void
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1
+; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]]
+; IR-DPP: 25:
+; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] monotonic, align 4
+; IR-DPP-NEXT: br label [[TMP27]]
+; IR-DPP: 27:
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: ret void
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 04df04a5c299b..b6990c8b842fd 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -5873,10 +5873,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -5891,24 +5891,47 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB10_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB10_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB10_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB10_4
+; GFX9-NEXT: .LBB10_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
@@ -5918,43 +5941,66 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB10_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB10_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB10_4
+; GFX1064-NEXT: .LBB10_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
@@ -5964,115 +6010,190 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB10_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB10_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB10_4
+; GFX1032-NEXT: .LBB10_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB10_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB10_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB10_4
+; GFX1164-NEXT: .LBB10_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB10_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB10_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB10_4
+; GFX1132-NEXT: .LBB10_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
@@ -6083,10 +6204,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
@@ -6101,24 +6222,83 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX9-DPP-NEXT: .LBB10_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
@@ -6128,43 +6308,93 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1064-DPP-NEXT: .LBB10_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
@@ -6174,115 +6404,271 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1032-DPP-NEXT: .LBB10_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1164-DPP-NEXT: .LBB10_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1132-DPP-NEXT: .LBB10_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.float.value()
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8
@@ -6820,10 +7206,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
@@ -6838,24 +7224,47 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-NEXT: .LBB12_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB12_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB12_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB12_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB12_4
+; GFX9-NEXT: .LBB12_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
@@ -6865,43 +7274,66 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB12_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB12_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB12_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB12_4
+; GFX1064-NEXT: .LBB12_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
@@ -6911,115 +7343,190 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB12_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB12_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB12_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB12_4
+; GFX1032-NEXT: .LBB12_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB12_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB12_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB12_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB12_4
+; GFX1164-NEXT: .LBB12_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB12_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB12_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB12_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB12_4
+; GFX1132-NEXT: .LBB12_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
@@ -7030,10 +7537,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
@@ -7048,24 +7555,83 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX9-DPP-NEXT: .LBB12_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
@@ -7075,161 +7641,367 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1064-DPP-NEXT: s_endpgm
-;
-; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
-; GFX1032-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX1064-DPP-NEXT: .LBB12_3:
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
+; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX1032-DPP-NEXT: .LBB12_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX1164-DPP-NEXT: .LBB12_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX1132-DPP-NEXT: .LBB12_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value() strictfp
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic
@@ -7767,10 +8539,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -7785,24 +8557,47 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB14_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB14_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB14_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB14_4
+; GFX9-NEXT: .LBB14_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
@@ -7812,43 +8607,66 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB14_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB14_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB14_4
+; GFX1064-NEXT: .LBB14_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
@@ -7858,129 +8676,204 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB14_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB14_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB14_4
+; GFX1032-NEXT: .LBB14_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB14_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB14_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB14_4
+; GFX1164-NEXT: .LBB14_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB14_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1132-NEXT: s_endpgm
-;
-; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
-; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT: s_mov_b32 s38, -1
-; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
-; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB14_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB14_4: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB14_4
+; GFX1132-NEXT: .LBB14_5:
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s38, -1
+; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX9-DPP-NEXT: s_mov_b32 s14, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -7995,24 +8888,83 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB14_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_2
+; GFX9-DPP-NEXT: .LBB14_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
@@ -8022,43 +8974,93 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB14_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_2
+; GFX1064-DPP-NEXT: .LBB14_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
@@ -8068,115 +9070,271 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB14_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_2
+; GFX1032-DPP-NEXT: .LBB14_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB14_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_2
+; GFX1164-DPP-NEXT: .LBB14_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_2
+; GFX1132-DPP-NEXT: .LBB14_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic
@@ -8245,10 +9403,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -8263,24 +9421,47 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB15_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB15_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB15_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB15_4
+; GFX9-NEXT: .LBB15_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -8290,43 +9471,66 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB15_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB15_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB15_4
+; GFX1064-NEXT: .LBB15_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -8336,115 +9540,190 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB15_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1032-NEXT: s_endpgm
-;
-; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
-; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB15_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB15_4: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB15_4
+; GFX1032-NEXT: .LBB15_5:
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT: s_mov_b32 s14, s8
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB15_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB15_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB15_4
+; GFX1164-NEXT: .LBB15_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB15_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB15_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB15_4
+; GFX1132-NEXT: .LBB15_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -8455,10 +9734,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -8473,24 +9752,83 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB15_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_2
+; GFX9-DPP-NEXT: .LBB15_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -8500,43 +9838,93 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB15_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_2
+; GFX1064-DPP-NEXT: .LBB15_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -8546,115 +9934,271 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB15_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_2
+; GFX1032-DPP-NEXT: .LBB15_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB15_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_2
+; GFX1164-DPP-NEXT: .LBB15_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_2
+; GFX1132-DPP-NEXT: .LBB15_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.float.value() strictfp
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic
@@ -9192,10 +10736,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -9210,26 +10754,49 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-NEXT: .LBB17_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB17_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_endpgm
-;
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB17_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB17_4: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB17_4
+; GFX9-NEXT: .LBB17_5:
+; GFX9-NEXT: s_endpgm
+;
; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -9237,43 +10804,66 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB17_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB17_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB17_4
+; GFX1064-NEXT: .LBB17_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
@@ -9283,115 +10873,190 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB17_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB17_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB17_4
+; GFX1032-NEXT: .LBB17_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB17_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB17_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB17_4
+; GFX1164-NEXT: .LBB17_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB17_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB17_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB17_4
+; GFX1132-NEXT: .LBB17_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
@@ -9402,10 +11067,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
@@ -9420,24 +11085,83 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB17_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_2
+; GFX9-DPP-NEXT: .LBB17_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
@@ -9447,43 +11171,93 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB17_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_2
+; GFX1064-DPP-NEXT: .LBB17_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
@@ -9493,115 +11267,271 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB17_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_2
+; GFX1032-DPP-NEXT: .LBB17_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_2
+; GFX1164-DPP-NEXT: .LBB17_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_2
+; GFX1132-DPP-NEXT: .LBB17_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.float.value() strictfp
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue monotonic, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 9f27314cc3909..f512f17bbbcbf 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -3695,10 +3695,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
@@ -3713,26 +3713,51 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX9-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB7_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB7_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB7_4
+; GFX9-NEXT: .LBB7_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
@@ -3742,29 +3767,55 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0xfff00000
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB7_4
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_atomic_fmax_x2 v0, v[2:3], s[0:1]
+; GFX1064-NEXT: .LBB7_4:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
@@ -3774,107 +3825,191 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0xfff00000
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB7_4
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_atomic_fmax_x2 v0, v[2:3], s[0:1]
+; GFX1032-NEXT: .LBB7_4:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB7_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB7_4
+; GFX1164-NEXT: .LBB7_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB7_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB7_4
+; GFX1132-NEXT: .LBB7_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
@@ -3885,10 +4020,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
@@ -3903,26 +4038,92 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1]
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[1:2]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX9-DPP-NEXT: .LBB7_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
@@ -3932,29 +4133,88 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3]
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_2
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT: .LBB7_2:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
@@ -3964,115 +4224,296 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_2
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT: .LBB7_2:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xfff00000
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1164-DPP-NEXT: .LBB7_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1132-DPP-NEXT: s_endpgm
- %divValue = call double @div.double.value()
- %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0xfff00000
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1132-DPP-NEXT: .LBB7_3:
+; GFX1132-DPP-NEXT: s_endpgm
+ %divValue = call double @div.double.value()
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
@@ -4490,10 +4931,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -4508,26 +4949,51 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX9-NEXT: .LBB9_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX9-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB9_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB9_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB9_4
+; GFX9-NEXT: .LBB9_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
@@ -4537,45 +5003,70 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB9_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB9_4
+; GFX1064-NEXT: .LBB9_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
@@ -4585,123 +5076,206 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB9_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB9_4
+; GFX1032-NEXT: .LBB9_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB9_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB9_4
+; GFX1164-NEXT: .LBB9_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB9_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB9_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB9_4
+; GFX1132-NEXT: .LBB9_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
@@ -4712,10 +5286,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
@@ -4730,26 +5304,92 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1]
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[1:2]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX9-DPP-NEXT: .LBB9_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
@@ -4759,45 +5399,103 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3]
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[0:1], v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-DPP-NEXT: v_max_f64 v[13:14], v[11:12], v[11:12]
+; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[13:14], v[9:10]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1064-DPP-NEXT: .LBB9_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
@@ -4807,123 +5505,303 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1032-DPP-NEXT: .LBB9_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xfff00000
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1164-DPP-NEXT: .LBB9_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0xfff00000
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1132-DPP-NEXT: .LBB9_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
%result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic
@@ -5349,10 +6227,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
@@ -5367,26 +6245,51 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX9-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB11_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB11_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB11_4
+; GFX9-NEXT: .LBB11_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
@@ -5396,45 +6299,70 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB11_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB11_4
+; GFX1064-NEXT: .LBB11_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
@@ -5444,123 +6372,206 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB11_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB11_4
+; GFX1032-NEXT: .LBB11_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB11_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB11_4
+; GFX1164-NEXT: .LBB11_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB11_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB11_4
+; GFX1132-NEXT: .LBB11_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
@@ -5571,10 +6582,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -5589,26 +6600,92 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1]
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[1:2]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX9-DPP-NEXT: .LBB11_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
@@ -5618,45 +6695,103 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3]
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[0:1], v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-DPP-NEXT: v_max_f64 v[13:14], v[11:12], v[11:12]
+; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[13:14], v[9:10]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1064-DPP-NEXT: .LBB11_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
@@ -5666,123 +6801,303 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1032-DPP-NEXT: .LBB11_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xfff00000
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1164-DPP-NEXT: .LBB11_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0xfff00000
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1132-DPP-NEXT: .LBB11_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
%result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue monotonic, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index f16f61159fc30..c3b3079db3adc 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -3695,10 +3695,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -3713,26 +3713,51 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX9-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB7_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB7_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB7_4
+; GFX9-NEXT: .LBB7_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
@@ -3742,29 +3767,55 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff00000
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB7_4
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_atomic_fmin_x2 v0, v[2:3], s[0:1]
+; GFX1064-NEXT: .LBB7_4:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
@@ -3774,107 +3825,191 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff00000
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB7_4
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_atomic_fmin_x2 v0, v[2:3], s[0:1]
+; GFX1032-NEXT: .LBB7_4:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB7_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB7_4
+; GFX1164-NEXT: .LBB7_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB7_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB7_4
+; GFX1132-NEXT: .LBB7_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
@@ -3885,10 +4020,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -3903,26 +4038,92 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1]
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX9-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[1:2]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX9-DPP-NEXT: .LBB7_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
@@ -3932,29 +4133,88 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3]
+; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_2
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT: .LBB7_2:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
@@ -3964,115 +4224,296 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_2
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT: .LBB7_2:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff00000
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1164-DPP-NEXT: .LBB7_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1132-DPP-NEXT: s_endpgm
- %divValue = call double @div.double.value()
- %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff00000
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1132-DPP-NEXT: .LBB7_3:
+; GFX1132-DPP-NEXT: s_endpgm
+ %divValue = call double @div.double.value()
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
@@ -4490,10 +4931,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -4508,26 +4949,51 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX9-NEXT: .LBB9_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX9-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB9_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB9_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB9_4
+; GFX9-NEXT: .LBB9_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
@@ -4537,45 +5003,70 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB9_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB9_4
+; GFX1064-NEXT: .LBB9_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
@@ -4585,123 +5076,206 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB9_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB9_4
+; GFX1032-NEXT: .LBB9_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB9_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB9_4
+; GFX1164-NEXT: .LBB9_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB9_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB9_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB9_4
+; GFX1132-NEXT: .LBB9_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
@@ -4712,10 +5286,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -4730,26 +5304,92 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1]
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX9-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[1:2]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX9-DPP-NEXT: .LBB9_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
@@ -4759,45 +5399,103 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3]
+; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[0:1], v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-DPP-NEXT: v_max_f64 v[13:14], v[11:12], v[11:12]
+; GFX1064-DPP-NEXT: v_min_f64 v[9:10], v[13:14], v[9:10]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1064-DPP-NEXT: .LBB9_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
@@ -4807,123 +5505,303 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX1032-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1032-DPP-NEXT: .LBB9_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff00000
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1164-DPP-NEXT: .LBB9_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff00000
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1132-DPP-NEXT: .LBB9_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
%result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic
@@ -5349,10 +6227,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -5367,26 +6245,51 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX9-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB11_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB11_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB11_4
+; GFX9-NEXT: .LBB11_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
@@ -5396,45 +6299,70 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB11_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB11_4
+; GFX1064-NEXT: .LBB11_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
@@ -5444,123 +6372,206 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB11_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB11_4
+; GFX1032-NEXT: .LBB11_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB11_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB11_4
+; GFX1164-NEXT: .LBB11_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB11_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB11_4
+; GFX1132-NEXT: .LBB11_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
@@ -5571,10 +6582,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -5589,26 +6600,92 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1]
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX9-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[1:2]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX9-DPP-NEXT: .LBB11_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
@@ -5618,45 +6695,103 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3]
+; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[0:1], v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-DPP-NEXT: v_max_f64 v[13:14], v[11:12], v[11:12]
+; GFX1064-DPP-NEXT: v_min_f64 v[9:10], v[13:14], v[9:10]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1064-DPP-NEXT: .LBB11_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
@@ -5666,123 +6801,303 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX1032-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1032-DPP-NEXT: .LBB11_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff00000
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1164-DPP-NEXT: .LBB11_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff00000
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1132-DPP-NEXT: .LBB11_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
%result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue monotonic, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 64650e2733a00..8664fdf242036 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -6081,10 +6081,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -6099,24 +6099,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB10_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB10_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB10_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB10_4
+; GFX9-NEXT: .LBB10_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
@@ -6126,43 +6149,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB10_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB10_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB10_4
+; GFX1064-NEXT: .LBB10_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
@@ -6172,115 +6218,190 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB10_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB10_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB10_4
+; GFX1032-NEXT: .LBB10_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB10_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB10_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB10_4
+; GFX1164-NEXT: .LBB10_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB10_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB10_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB10_4
+; GFX1132-NEXT: .LBB10_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
@@ -6291,10 +6412,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -6309,24 +6430,83 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX9-DPP-NEXT: .LBB10_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
@@ -6336,43 +6516,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1064-DPP-NEXT: .LBB10_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
@@ -6382,115 +6612,271 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1032-DPP-NEXT: .LBB10_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1164-DPP-NEXT: .LBB10_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1132-DPP-NEXT: .LBB10_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.float.value()
%result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8
@@ -7027,10 +7413,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -7045,24 +7431,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-NEXT: .LBB12_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB12_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB12_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB12_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB12_4
+; GFX9-NEXT: .LBB12_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
@@ -7072,43 +7481,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB12_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB12_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB12_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB12_4
+; GFX1064-NEXT: .LBB12_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
@@ -7118,115 +7550,190 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB12_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB12_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB12_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB12_4
+; GFX1032-NEXT: .LBB12_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB12_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB12_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB12_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB12_4
+; GFX1164-NEXT: .LBB12_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB12_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB12_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB12_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB12_4
+; GFX1132-NEXT: .LBB12_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
@@ -7237,10 +7744,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -7255,24 +7762,83 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX9-DPP-NEXT: .LBB12_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
@@ -7282,161 +7848,367 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1064-DPP-NEXT: s_endpgm
-;
-; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
-; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX1064-DPP-NEXT: .LBB12_3:
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX1032-DPP-NEXT: .LBB12_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX1164-DPP-NEXT: .LBB12_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX1132-DPP-NEXT: .LBB12_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value() strictfp
%result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic
@@ -7974,10 +8746,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -7992,24 +8764,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB14_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB14_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB14_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB14_4
+; GFX9-NEXT: .LBB14_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
@@ -8019,43 +8814,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB14_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB14_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB14_4
+; GFX1064-NEXT: .LBB14_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
@@ -8065,129 +8883,204 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB14_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB14_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB14_4
+; GFX1032-NEXT: .LBB14_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB14_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB14_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB14_4
+; GFX1164-NEXT: .LBB14_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB14_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1132-NEXT: s_endpgm
-;
-; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
-; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT: s_mov_b32 s38, -1
-; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB14_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB14_4: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB14_4
+; GFX1132-NEXT: .LBB14_5:
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s38, -1
+; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -8202,24 +9095,83 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB14_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_2
+; GFX9-DPP-NEXT: .LBB14_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
@@ -8229,43 +9181,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB14_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_2
+; GFX1064-DPP-NEXT: .LBB14_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
@@ -8275,115 +9277,271 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB14_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_2
+; GFX1032-DPP-NEXT: .LBB14_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB14_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_2
+; GFX1164-DPP-NEXT: .LBB14_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_2
+; GFX1132-DPP-NEXT: .LBB14_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
%result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic
@@ -8452,10 +9610,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -8470,24 +9628,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB15_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB15_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB15_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB15_4
+; GFX9-NEXT: .LBB15_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -8497,43 +9678,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB15_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB15_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB15_4
+; GFX1064-NEXT: .LBB15_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -8543,115 +9747,190 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB15_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1032-NEXT: s_endpgm
-;
-; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
-; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v31, v0
-; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
-; GFX1164-NEXT: s_mov_b32 s12, s6
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB15_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB15_4: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB15_4
+; GFX1032-NEXT: .LBB15_5:
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT: s_mov_b32 s14, s8
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB15_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB15_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB15_4
+; GFX1164-NEXT: .LBB15_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB15_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB15_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB15_4
+; GFX1132-NEXT: .LBB15_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -8662,10 +9941,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -8680,24 +9959,83 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB15_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_2
+; GFX9-DPP-NEXT: .LBB15_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -8707,43 +10045,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB15_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_2
+; GFX1064-DPP-NEXT: .LBB15_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -8753,115 +10141,271 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB15_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_2
+; GFX1032-DPP-NEXT: .LBB15_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB15_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_2
+; GFX1164-DPP-NEXT: .LBB15_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_2
+; GFX1132-DPP-NEXT: .LBB15_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.float.value() strictfp
%result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic
@@ -9398,10 +10942,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -9416,24 +10960,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-NEXT: .LBB17_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB17_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB17_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB17_4: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB17_4
+; GFX9-NEXT: .LBB17_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
@@ -9443,43 +11010,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB17_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB17_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB17_4
+; GFX1064-NEXT: .LBB17_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
@@ -9489,115 +11079,190 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB17_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB17_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB17_4
+; GFX1032-NEXT: .LBB17_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB17_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB17_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB17_4
+; GFX1164-NEXT: .LBB17_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB17_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB17_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB17_4
+; GFX1132-NEXT: .LBB17_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
@@ -9608,10 +11273,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -9626,24 +11291,83 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB17_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_2
+; GFX9-DPP-NEXT: .LBB17_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
@@ -9653,43 +11377,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB17_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_2
+; GFX1064-DPP-NEXT: .LBB17_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
@@ -9699,115 +11473,271 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB17_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_2
+; GFX1032-DPP-NEXT: .LBB17_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_2
+; GFX1164-DPP-NEXT: .LBB17_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_2
+; GFX1132-DPP-NEXT: .LBB17_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.float.value() strictfp
%result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue monotonic, align 8
>From 8b8b6c89a117175829ae1924a25c1bf6f5958c09 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Mon, 1 Jul 2024 04:01:59 -0400
Subject: [PATCH 2/3] review comments
---
llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index acddee0ba64e3..58ac9e444a51b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -178,18 +178,19 @@ bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
return Changed;
}
-static bool shouldOptimizeForType(Type *Ty) {
+static bool isOptimizableAtomic(Type *Ty) {
switch (Ty->getTypeID()) {
case Type::FloatTyID:
case Type::DoubleTyID:
return true;
case Type::IntegerTyID: {
- if (Ty->getIntegerBitWidth() == 32 || Ty->getIntegerBitWidth() == 64)
+ unsigned size = Ty->getIntegerBitWidth();
+ if (size == 32 || size == 64)
return true;
+ }
default:
return false;
}
- }
}
void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
@@ -244,7 +245,7 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
// value to the atomic calculation. We can only optimize divergent values if
// we have DPP available on our subtarget, and the atomic operation is 32
// bits.
- if (ValDivergent && (!ST->hasDPP() || !shouldOptimizeForType(I.getType()))) {
+ if (ValDivergent && (!ST->hasDPP() || !isOptimizableAtomic(I.getType()))) {
return;
}
@@ -326,7 +327,7 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
// value to the atomic calculation. We can only optimize divergent values if
// we have DPP available on our subtarget, and the atomic operation is 32
// bits.
- if (ValDivergent && (!ST->hasDPP() || !shouldOptimizeForType(I.getType()))) {
+ if (ValDivergent && (!ST->hasDPP() || !isOptimizableAtomic(I.getType()))) {
return;
}
>From 88cebe153c0ed0b66390e9e8aa828e200bdb5945 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Mon, 1 Jul 2024 08:24:55 -0400
Subject: [PATCH 3/3] review comments
---
.../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 29 +-
.../AMDGPU/atomic_optimizations_buffer.ll | 115 +-
.../atomic_optimizations_global_pointer.ll | 6782 ++++--
.../atomic_optimizations_local_pointer.ll | 19535 +++++++++++++---
.../AMDGPU/atomic_optimizations_raw_buffer.ll | 74 +-
.../atomic_optimizations_struct_buffer.ll | 78 +-
.../AMDGPU/global_atomics_scan_fadd.ll | 1730 +-
.../AMDGPU/global_atomics_scan_fmax.ll | 1013 +-
.../AMDGPU/global_atomics_scan_fmin.ll | 1013 +-
.../AMDGPU/global_atomics_scan_fsub.ll | 1658 +-
.../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 232 +-
11 files changed, 25740 insertions(+), 6519 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 58ac9e444a51b..440b00fe22c7d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -184,9 +184,8 @@ static bool isOptimizableAtomic(Type *Ty) {
case Type::DoubleTyID:
return true;
case Type::IntegerTyID: {
- unsigned size = Ty->getIntegerBitWidth();
- if (size == 32 || size == 64)
- return true;
+ unsigned Size = Ty->getIntegerBitWidth();
+ return (Size == 32 || Size == 64);
}
default:
return false;
@@ -243,10 +242,14 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
// If the value operand is divergent, each lane is contributing a different
// value to the atomic calculation. We can only optimize divergent values if
- // we have DPP available on our subtarget, and the atomic operation is 32
- // bits.
- if (ValDivergent && (!ST->hasDPP() || !isOptimizableAtomic(I.getType()))) {
- return;
+ // we have DPP available on our subtarget (for DPP strategy), and the atomic
+ // operation is 32 or 64 bits.
+ if (ValDivergent) {
+ if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
+ return;
+
+ if (!isOptimizableAtomic(I.getType()))
+ return;
}
// If we get here, we can optimize the atomic using a single wavefront-wide
@@ -325,10 +328,14 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
// If the value operand is divergent, each lane is contributing a different
// value to the atomic calculation. We can only optimize divergent values if
- // we have DPP available on our subtarget, and the atomic operation is 32
- // bits.
- if (ValDivergent && (!ST->hasDPP() || !isOptimizableAtomic(I.getType()))) {
- return;
+ // we have DPP available on our subtarget (for DPP strategy), and the atomic
+ // operation is 32 or 64 bits.
+ if (ValDivergent) {
+ if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
+ return;
+
+ if (!isOptimizableAtomic(I.getType()))
+ return;
}
// If any of the other arguments to the intrinsic are divergent, we can't
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 624101dc12c5f..cff408c8f01b4 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -571,13 +571,44 @@ entry:
define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
; GFX6-LABEL: add_i32_varying_vdata:
; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
-; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b64 s[2:3], exec
+; GFX6-NEXT: s_mov_b32 s4, 0
+; GFX6-NEXT: ; implicit-def: $vgpr1
+; GFX6-NEXT: .LBB2_1: ; %ComputeLoop
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX6-NEXT: s_mov_b32 m0, s5
+; GFX6-NEXT: v_readlane_b32 s8, v0, s5
+; GFX6-NEXT: v_writelane_b32 v1, s4, m0
+; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
+; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX6-NEXT: s_add_i32 s4, s4, s8
+; GFX6-NEXT: s_cbranch_vccnz .LBB2_1
+; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX6-NEXT: ; implicit-def: $vgpr0
+; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX6-NEXT: s_cbranch_execz .LBB2_4
+; GFX6-NEXT: ; %bb.3:
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
+; GFX6-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX6-NEXT: .LBB2_4:
+; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_readfirstlane_b32 s4, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
@@ -924,15 +955,46 @@ entry:
define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %vindex) {
; GFX6-LABEL: struct_add_i32_varying_vdata:
; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_load_dword s2, s[0:1], 0x11
-; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
-; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b64 s[2:3], exec
+; GFX6-NEXT: s_mov_b32 s4, 0
+; GFX6-NEXT: ; implicit-def: $vgpr1
+; GFX6-NEXT: .LBB3_1: ; %ComputeLoop
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX6-NEXT: s_mov_b32 m0, s5
+; GFX6-NEXT: v_readlane_b32 s8, v0, s5
+; GFX6-NEXT: v_writelane_b32 v1, s4, m0
+; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
+; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX6-NEXT: s_add_i32 s4, s4, s8
+; GFX6-NEXT: s_cbranch_vccnz .LBB3_1
+; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX6-NEXT: ; implicit-def: $vgpr0
+; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX6-NEXT: s_cbranch_execz .LBB3_4
+; GFX6-NEXT: ; %bb.3:
+; GFX6-NEXT: s_load_dword s5, s[0:1], 0x11
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v1, s2
-; GFX6-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 idxen glc
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
+; GFX6-NEXT: .LBB3_4:
+; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_readfirstlane_b32 s4, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
@@ -1953,13 +2015,44 @@ entry:
define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
; GFX6-LABEL: sub_i32_varying_vdata:
; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
-; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b64 s[2:3], exec
+; GFX6-NEXT: s_mov_b32 s4, 0
+; GFX6-NEXT: ; implicit-def: $vgpr1
+; GFX6-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX6-NEXT: s_mov_b32 m0, s5
+; GFX6-NEXT: v_readlane_b32 s8, v0, s5
+; GFX6-NEXT: v_writelane_b32 v1, s4, m0
+; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
+; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX6-NEXT: s_add_i32 s4, s4, s8
+; GFX6-NEXT: s_cbranch_vccnz .LBB7_1
+; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX6-NEXT: ; implicit-def: $vgpr0
+; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX6-NEXT: s_cbranch_execz .LBB7_4
+; GFX6-NEXT: ; %bb.3:
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc
+; GFX6-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX6-NEXT: .LBB7_4:
+; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_readfirstlane_b32 s4, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index f0cec54691d5d..d62ba78a039d4 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -1,13 +1,22 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232 %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_DPP %s
declare i32 @llvm.amdgcn.workitem.id.x()
@@ -46,37 +55,69 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
-; GFX89-LABEL: add_i32_constant:
-; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b64 s[6:7], exec
-; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
-; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX89-NEXT: ; implicit-def: $vgpr1
-; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX89-NEXT: s_cbranch_execz .LBB0_2
-; GFX89-NEXT: ; %bb.1:
-; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
-; GFX89-NEXT: s_mul_i32 s2, s2, 5
-; GFX89-NEXT: s_mov_b32 s11, 0xf000
-; GFX89-NEXT: s_mov_b32 s10, -1
-; GFX89-NEXT: s_mov_b32 s9, s3
-; GFX89-NEXT: v_mov_b32_e32 v1, s2
-; GFX89-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
-; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_wbinvl1_vol
-; GFX89-NEXT: .LBB0_2:
-; GFX89-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX89-NEXT: v_readfirstlane_b32 s4, v1
-; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s3, 0xf000
-; GFX89-NEXT: s_mov_b32 s2, -1
-; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4
-; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX89-NEXT: s_endpgm
+; GFX8-LABEL: add_i32_constant:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b64 s[6:7], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: ; implicit-def: $vgpr1
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_cbranch_execz .LBB0_2
+; GFX8-NEXT: ; %bb.1:
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s8, s2
+; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX8-NEXT: s_mul_i32 s2, s2, 5
+; GFX8-NEXT: s_mov_b32 s11, 0xf000
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: .LBB0_2:
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: add_i32_constant:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: ; implicit-def: $vgpr1
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB0_2
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s2
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT: s_mul_i32 s2, s2, 5
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: .LBB0_2:
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_constant:
; GFX1064: ; %bb.0: ; %entry
@@ -626,640 +667,1218 @@ entry:
}
define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
-; GFX7LESS-LABEL: add_i32_varying:
+; GFX7LESS_ITERATIVE-LABEL: add_i32_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s6, 0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX7LESS_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s4
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB2_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6
+; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1
+; GFX7LESS_ITERATIVE-NEXT: .LBB2_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: v_add_i32_e32 v0, vcc, s4, v1
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
+;
+; GFX8_ITERATIVE-LABEL: add_i32_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s6, 0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX8_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s4
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
+; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6
+; GFX8_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol
+; GFX8_ITERATIVE-NEXT: .LBB2_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_add_u32_e32 v0, vcc, s4, v1
+; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
+;
+; GFX9_ITERATIVE-LABEL: add_i32_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, 0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX9_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s4
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
+; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6
+; GFX9_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol
+; GFX9_ITERATIVE-NEXT: .LBB2_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_add_u32_e32 v0, s4, v1
+; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1064_ITERATIVE-LABEL: add_i32_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1064_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB2_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1032_ITERATIVE-LABEL: add_i32_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1032_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s3
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s6
+; GFX1032_ITERATIVE-NEXT: s_add_i32 s4, s4, s5
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1032_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB2_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1164_ITERATIVE-LABEL: add_i32_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s6, 0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1164_ITERATIVE-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
+; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB2_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1132_ITERATIVE-LABEL: add_i32_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1132_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s3
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s6
+; GFX1132_ITERATIVE-NEXT: s_add_i32 s4, s4, s5
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1132_ITERATIVE-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
+; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB2_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1264_ITERATIVE-LABEL: add_i32_varying:
+; GFX1264_ITERATIVE: ; %bb.0: ; %entry
+; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s6, 0
+; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
+; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
+; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8
+; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4
+; GFX1264_ITERATIVE-NEXT: ; %bb.3:
+; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1264_ITERATIVE-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0
+; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV
+; GFX1264_ITERATIVE-NEXT: .LBB2_4:
+; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
+; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1264_ITERATIVE-NEXT: s_nop 0
+; GFX1264_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1264_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1232_ITERATIVE-LABEL: add_i32_varying:
+; GFX1232_ITERATIVE: ; %bb.0: ; %entry
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
+; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s3
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s3
+; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s6
+; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s4, s4, s5
+; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; GFX1232_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4
+; GFX1232_ITERATIVE-NEXT: ; %bb.3:
+; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1232_ITERATIVE-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0
+; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV
+; GFX1232_ITERATIVE-NEXT: .LBB2_4:
+; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
+; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1232_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1232_ITERATIVE-NEXT: s_nop 0
+; GFX1232_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1232_ITERATIVE-NEXT: s_endpgm
+;
+; GFX7LESS_DPP-LABEL: add_i32_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1
+; GFX7LESS_DPP-NEXT: s_mov_b32 s10, s6
+; GFX7LESS_DPP-NEXT: s_mov_b32 s11, s7
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s8, s2
+; GFX7LESS_DPP-NEXT: s_mov_b32 s9, s3
+; GFX7LESS_DPP-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX7LESS_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS_DPP-NEXT: buffer_wbinvl1
+; GFX7LESS_DPP-NEXT: s_mov_b32 s4, s0
+; GFX7LESS_DPP-NEXT: s_mov_b32 s5, s1
+; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
+;
+; GFX8_DPP-LABEL: add_i32_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s6, v2, 63
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB2_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s10, -1
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: s_mov_b32 s8, s2
+; GFX8_DPP-NEXT: s_mov_b32 s9, s3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s6
+; GFX8_DPP-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX8_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX8_DPP-NEXT: buffer_wbinvl1_vol
+; GFX8_DPP-NEXT: .LBB2_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_add_u32_e32 v0, vcc, s4, v0
+; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: add_i32_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s6, v2, 63
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB2_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s10, -1
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: s_mov_b32 s8, s2
+; GFX9_DPP-NEXT: s_mov_b32 s9, s3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s6
+; GFX9_DPP-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX9_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9_DPP-NEXT: buffer_wbinvl1_vol
+; GFX9_DPP-NEXT: .LBB2_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: add_i32_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 63
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX1064_DPP-NEXT: s_mov_b32 s4, s9
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s8, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB2_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1064_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1064_DPP-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
+; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl1_inv
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB2_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: add_i32_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v3, s5, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s4, s6
+; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1032_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1032_DPP-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
+; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl1_inv
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB2_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: add_i32_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 63
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s7, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX1164_DPP-NEXT: s_mov_b32 s4, s9
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s8, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB2_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1164_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1164_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
+; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl1_inv
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB2_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX1164_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: add_i32_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132_DPP-NEXT: s_mov_b32 s4, s6
+; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1132_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1132_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
+; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl1_inv
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB2_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX1132_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
+;
+; GFX1264_DPP-LABEL: add_i32_varying:
+; GFX1264_DPP: ; %bb.0: ; %entry
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1264_DPP-NEXT: s_not_b64 exec, exec
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1264_DPP-NEXT: s_not_b64 exec, exec
+; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 31
+; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 16
+; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1264_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 63
+; GFX1264_DPP-NEXT: v_writelane_b32 v3, s7, 32
+; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX1264_DPP-NEXT: s_mov_b32 s4, s9
+; GFX1264_DPP-NEXT: v_writelane_b32 v3, s8, 48
+; GFX1264_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1264_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1264_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1264_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1264_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX1264_DPP-NEXT: s_cbranch_execz .LBB2_2
+; GFX1264_DPP-NEXT: ; %bb.1:
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX1264_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0
+; GFX1264_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1264_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1264_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
+; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0
+; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV
+; GFX1264_DPP-NEXT: .LBB2_2:
+; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0
+; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX1264_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1264_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1264_DPP-NEXT: s_nop 0
+; GFX1264_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1264_DPP-NEXT: s_endpgm
+;
+; GFX1232_DPP-LABEL: add_i32_varying:
+; GFX1232_DPP: ; %bb.0: ; %entry
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1232_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
+; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1232_DPP-NEXT: s_mov_b32 s4, s6
+; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; GFX1232_DPP-NEXT: s_cbranch_execz .LBB2_2
+; GFX1232_DPP-NEXT: ; %bb.1:
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0
+; GFX1232_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1232_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1232_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
+; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0
+; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV
+; GFX1232_DPP-NEXT: .LBB2_2:
+; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0
+; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX1232_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1232_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1232_DPP-NEXT: s_nop 0
+; GFX1232_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1232_DPP-NEXT: s_endpgm
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %old = atomicrmw add ptr addrspace(1) %inout, i32 %lane syncscope("agent") acq_rel
+ store i32 %old, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
+; GFX7LESS-LABEL: add_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
+; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec
; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s6, -1
-; GFX7LESS-NEXT: s_mov_b32 s10, s6
-; GFX7LESS-NEXT: s_mov_b32 s11, s7
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB3_2
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5
+; GFX7LESS-NEXT: s_mov_b32 s10, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s8, s2
; GFX7LESS-NEXT: s_mov_b32 s9, s3
-; GFX7LESS-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
-; GFX7LESS-NEXT: s_mov_b32 s4, s0
-; GFX7LESS-NEXT: s_mov_b32 s5, s1
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX7LESS-NEXT: .LBB3_2:
+; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
+; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s5, v0
+; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
-; GFX8-LABEL: add_i32_varying:
+; GFX8-LABEL: add_i64_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: s_mov_b32 s6, 0
-; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: .LBB2_1: ; %ComputeLoop
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s4, s[2:3]
-; GFX8-NEXT: s_mov_b32 m0, s4
-; GFX8-NEXT: v_readlane_b32 s7, v0, s4
-; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4
-; GFX8-NEXT: v_writelane_b32 v1, s6, m0
-; GFX8-NEXT: s_add_i32 s6, s6, s7
-; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
-; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT: ; implicit-def: $vgpr0
+; GFX8-NEXT: s_mov_b64 s[6:7], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execz .LBB2_4
-; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: s_mov_b32 s11, 0xf000
-; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_cbranch_execz .LBB3_2
+; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s8, s2
+; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX8-NEXT: s_mul_i32 s2, s2, 5
+; GFX8-NEXT: s_mov_b32 s11, 0xf000
+; GFX8-NEXT: s_mov_b32 s10, -1
; GFX8-NEXT: s_mov_b32 s9, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: .LBB2_4:
+; GFX8-NEXT: .LBB3_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s3, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_nop 2
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
-; GFX9-LABEL: add_i32_varying:
+; GFX9-LABEL: add_i64_constant:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: s_mov_b32 s6, 0
-; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: .LBB2_1: ; %ComputeLoop
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s4, s[2:3]
-; GFX9-NEXT: s_mov_b32 m0, s4
-; GFX9-NEXT: v_readlane_b32 s7, v0, s4
-; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4
-; GFX9-NEXT: v_writelane_b32 v1, s6, m0
-; GFX9-NEXT: s_add_i32 s6, s6, s7
-; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
-; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: ; implicit-def: $vgpr0
+; GFX9-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execz .LBB2_4
-; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_cbranch_execz .LBB3_2
+; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s2
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT: s_mul_i32 s2, s2, 5
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_mov_b32 s9, s3
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: .LBB2_4:
+; GFX9-NEXT: .LBB3_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s3, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_add_u32_e32 v0, s4, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_nop 2
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: add_i32_varying:
+; GFX1064-LABEL: add_i64_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: s_mov_b32 s6, 0
-; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s7, s[2:3]
-; GFX1064-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1064-NEXT: v_writelane_b32 v1, s6, s7
-; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX1064-NEXT: s_add_i32 s6, s6, s8
-; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1
-; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: ; implicit-def: $vgpr0
+; GFX1064-NEXT: s_mov_b64 s[6:7], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064-NEXT: s_cbranch_execz .LBB2_4
-; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-NEXT: s_cbranch_execz .LBB3_2
+; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: s_mul_i32 s6, s6, 5
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1064-NEXT: v_mov_b32_e32 v0, s6
; GFX1064-NEXT: s_mov_b32 s10, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_mov_b32 s8, s2
; GFX1064-NEXT: s_mov_b32 s9, s3
-; GFX1064-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB2_4:
+; GFX1064-NEXT: .LBB3_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1
; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: add_i32_varying:
+; GFX1032-LABEL: add_i64_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032-NEXT: s_mov_b32 s4, 0
-; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: .LBB2_1: ; %ComputeLoop
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s3, s2
-; GFX1032-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1032-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1032-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1032-NEXT: s_andn2_b32 s2, s2, s6
-; GFX1032-NEXT: s_add_i32 s4, s4, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1
-; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5
-; GFX1032-NEXT: s_cbranch_execz .LBB2_4
-; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v0, s4
+; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB3_2
+; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: s_mul_i32 s5, s5, 5
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1032-NEXT: v_mov_b32_e32 v0, s5
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_mov_b32 s8, s2
; GFX1032-NEXT: s_mov_b32 s9, s3
-; GFX1032-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB2_4:
+; GFX1032-NEXT: .LBB3_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1
; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: add_i32_varying:
+; GFX1164-LABEL: add_i64_constant:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_mov_b32 s6, 0
-; GFX1164-NEXT: ; implicit-def: $vgpr1
-; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1164-NEXT: v_writelane_b32 v1, s6, s7
-; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1164-NEXT: s_add_i32 s6, s6, s8
-; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1
-; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1164-NEXT: s_cbranch_execz .LBB2_4
-; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT: s_cbranch_execz .LBB3_2
+; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: s_mul_i32 s6, s6, 5
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1164-NEXT: v_mov_b32_e32 v0, s6
; GFX1164-NEXT: s_mov_b32 s10, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_mov_b32 s8, s2
; GFX1164-NEXT: s_mov_b32 s9, s3
-; GFX1164-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
+; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB2_4:
+; GFX1164-NEXT: .LBB3_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1
+; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: add_i32_varying:
+; GFX1132-LABEL: add_i64_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, 0
-; GFX1132-NEXT: ; implicit-def: $vgpr1
-; GFX1132-NEXT: .LBB2_1: ; %ComputeLoop
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1132-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1132-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1132-NEXT: s_and_not1_b32 s2, s2, s6
-; GFX1132-NEXT: s_add_i32 s4, s4, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1
-; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5
-; GFX1132-NEXT: s_cbranch_execz .LBB2_4
-; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, s4
+; GFX1132-NEXT: s_mov_b32 s5, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT: s_cbranch_execz .LBB3_2
+; GFX1132-NEXT: ; %bb.1:
+; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1132-NEXT: s_mul_i32 s5, s5, 5
; GFX1132-NEXT: s_mov_b32 s10, -1
+; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_mov_b32 s8, s2
; GFX1132-NEXT: s_mov_b32 s9, s3
-; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
+; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB2_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1132-NEXT: .LBB3_2:
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1
+; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
+; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
-; GFX1264-LABEL: add_i32_varying:
+; GFX1264-LABEL: add_i64_constant:
; GFX1264: ; %bb.0: ; %entry
-; GFX1264-NEXT: s_mov_b64 s[2:3], exec
-; GFX1264-NEXT: s_mov_b32 s6, 0
-; GFX1264-NEXT: ; implicit-def: $vgpr1
-; GFX1264-NEXT: .LBB2_1: ; %ComputeLoop
-; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1264-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1264-NEXT: v_writelane_b32 v1, s6, s7
-; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1264-NEXT: s_add_co_i32 s6, s6, s8
-; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1264-NEXT: s_cbranch_scc1 .LBB2_1
-; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1264-NEXT: ; implicit-def: $vgpr0
-; GFX1264-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1264-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1264-NEXT: s_cbranch_execz .LBB2_4
-; GFX1264-NEXT: ; %bb.3:
-; GFX1264-NEXT: v_mov_b32_e32 v0, s6
-; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT: s_mov_b32 s10, -1
-; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s8, s2
-; GFX1264-NEXT: s_mov_b32 s9, s3
-; GFX1264-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1264-NEXT: s_wait_loadcnt 0x0
-; GFX1264-NEXT: global_inv scope:SCOPE_DEV
-; GFX1264-NEXT: .LBB2_4:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
-; GFX1264-NEXT: s_nop 0
-; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1264-NEXT: s_endpgm
-;
-; GFX1232-LABEL: add_i32_varying:
-; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: s_mov_b32 s2, exec_lo
-; GFX1232-NEXT: s_mov_b32 s4, 0
-; GFX1232-NEXT: ; implicit-def: $vgpr1
-; GFX1232-NEXT: .LBB2_1: ; %ComputeLoop
-; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1232-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1232-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1232-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1232-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1232-NEXT: s_and_not1_b32 s2, s2, s6
-; GFX1232-NEXT: s_add_co_i32 s4, s4, s5
-; GFX1232-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1232-NEXT: s_cbranch_scc1 .LBB2_1
-; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1232-NEXT: ; implicit-def: $vgpr0
-; GFX1232-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1232-NEXT: s_xor_b32 s5, exec_lo, s5
-; GFX1232-NEXT: s_cbranch_execz .LBB2_4
-; GFX1232-NEXT: ; %bb.3:
-; GFX1232-NEXT: v_mov_b32_e32 v0, s4
-; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s8, s2
-; GFX1232-NEXT: s_mov_b32 s9, s3
-; GFX1232-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1232-NEXT: s_wait_loadcnt 0x0
-; GFX1232-NEXT: global_inv scope:SCOPE_DEV
-; GFX1232-NEXT: .LBB2_4:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
-; GFX1232-NEXT: s_nop 0
-; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1232-NEXT: s_endpgm
-entry:
- %lane = call i32 @llvm.amdgcn.workitem.id.x()
- %old = atomicrmw add ptr addrspace(1) %inout, i32 %lane syncscope("agent") acq_rel
- store i32 %old, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
-; GFX7LESS-LABEL: add_i64_constant:
-; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec
-; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX7LESS-NEXT: s_cbranch_execz .LBB3_2
-; GFX7LESS-NEXT: ; %bb.1:
-; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000
-; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
-; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5
-; GFX7LESS-NEXT: s_mov_b32 s10, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mov_b32 s8, s2
-; GFX7LESS-NEXT: s_mov_b32 s9, s3
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
-; GFX7LESS-NEXT: .LBB3_2:
-; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
-; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s5, v0
-; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
-; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX7LESS-NEXT: s_endpgm
-;
-; GFX89-LABEL: add_i64_constant:
-; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b64 s[6:7], exec
-; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX89-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
-; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX89-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX89-NEXT: s_cbranch_execz .LBB3_2
-; GFX89-NEXT: ; %bb.1:
-; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
-; GFX89-NEXT: s_mul_i32 s2, s2, 5
-; GFX89-NEXT: s_mov_b32 s11, 0xf000
-; GFX89-NEXT: s_mov_b32 s10, -1
-; GFX89-NEXT: s_mov_b32 s9, s3
-; GFX89-NEXT: v_mov_b32_e32 v0, s2
-; GFX89-NEXT: v_mov_b32_e32 v1, 0
-; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
-; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_wbinvl1_vol
-; GFX89-NEXT: .LBB3_2:
-; GFX89-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: v_readfirstlane_b32 s2, v1
-; GFX89-NEXT: v_readfirstlane_b32 s3, v0
-; GFX89-NEXT: v_mov_b32_e32 v0, s3
-; GFX89-NEXT: v_mov_b32_e32 v1, s2
-; GFX89-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
-; GFX89-NEXT: s_mov_b32 s3, 0xf000
-; GFX89-NEXT: s_mov_b32 s2, -1
-; GFX89-NEXT: s_nop 2
-; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX89-NEXT: s_endpgm
-;
-; GFX1064-LABEL: add_i64_constant:
-; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
-; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB3_2
-; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: s_mul_i32 s6, s6, 5
-; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
-; GFX1064-NEXT: s_mov_b32 s10, -1
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s8, s2
-; GFX1064-NEXT: s_mov_b32 s9, s3
-; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: buffer_gl1_inv
-; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB3_2:
-; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX1064-NEXT: s_endpgm
-;
-; GFX1032-LABEL: add_i64_constant:
-; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: s_mov_b32 s5, exec_lo
-; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB3_2
-; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: s_mul_i32 s5, s5, 5
-; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT: v_mov_b32_e32 v0, s5
-; GFX1032-NEXT: s_mov_b32 s10, -1
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s8, s2
-; GFX1032-NEXT: s_mov_b32 s9, s3
-; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: buffer_gl1_inv
-; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB3_2:
-; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX1032-NEXT: s_endpgm
-;
-; GFX1164-LABEL: add_i64_constant:
-; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
-; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
-; GFX1164-NEXT: s_cbranch_execz .LBB3_2
-; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: s_mul_i32 s6, s6, 5
-; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT: v_mov_b32_e32 v0, s6
-; GFX1164-NEXT: s_mov_b32 s10, -1
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s8, s2
-; GFX1164-NEXT: s_mov_b32 s9, s3
-; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: buffer_gl1_inv
-; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB3_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164-NEXT: s_nop 0
-; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1164-NEXT: s_endpgm
-;
-; GFX1132-LABEL: add_i64_constant:
-; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: s_mov_b32 s5, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
-; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
-; GFX1132-NEXT: s_cbranch_execz .LBB3_2
-; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5
-; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT: s_mul_i32 s5, s5, 5
-; GFX1132-NEXT: s_mov_b32 s10, -1
-; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s8, s2
-; GFX1132-NEXT: s_mov_b32 s9, s3
-; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: buffer_gl1_inv
-; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB3_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132-NEXT: s_nop 0
-; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1132-NEXT: s_endpgm
-;
-; GFX1264-LABEL: add_i64_constant:
-; GFX1264: ; %bb.0: ; %entry
-; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1264-NEXT: s_mov_b64 s[6:7], exec
-; GFX1264-NEXT: s_mov_b32 s9, 0
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1264-NEXT: s_mov_b64 s[4:5], exec
+; GFX1264-NEXT: s_mov_b64 s[6:7], exec
+; GFX1264-NEXT: s_mov_b32 s9, 0
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1264-NEXT: s_mov_b64 s[4:5], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -1709,475 +2328,1366 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1232-NEXT: s_mov_b32 s6, -1
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v2, s[2:3]
-; GFX1232-NEXT: v_mad_co_u64_u32 v[1:2], null, s1, v2, v[1:2]
-; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
-; GFX1232-NEXT: s_nop 0
-; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1232-NEXT: s_endpgm
-entry:
- %old = atomicrmw add ptr addrspace(1) %inout, i64 %additive syncscope("agent") acq_rel
- store i64 %old, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
-; GFX7LESS-LABEL: add_i64_varying:
-; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s6, -1
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: s_mov_b32 s10, s6
-; GFX7LESS-NEXT: s_mov_b32 s11, s7
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mov_b32 s8, s2
-; GFX7LESS-NEXT: s_mov_b32 s9, s3
-; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
-; GFX7LESS-NEXT: s_mov_b32 s4, s0
-; GFX7LESS-NEXT: s_mov_b32 s5, s1
-; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX7LESS-NEXT: s_endpgm
-;
-; GFX8-LABEL: add_i64_varying:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: v_mov_b32_e32 v3, 0
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX8-NEXT: .LBB5_1: ; %ComputeLoop
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s6, s[2:3]
-; GFX8-NEXT: s_mov_b32 m0, s6
-; GFX8-NEXT: v_readlane_b32 s8, v0, s6
-; GFX8-NEXT: v_readlane_b32 s7, v3, s6
-; GFX8-NEXT: v_writelane_b32 v1, s4, m0
-; GFX8-NEXT: s_add_u32 s4, s4, s8
-; GFX8-NEXT: v_writelane_b32 v2, s5, m0
-; GFX8-NEXT: s_addc_u32 s5, s5, s7
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8-NEXT: s_cbranch_scc1 .LBB5_1
-; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc
-; GFX8-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execz .LBB5_4
-; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: v_mov_b32_e32 v3, s4
-; GFX8-NEXT: s_mov_b32 s11, 0xf000
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s5
-; GFX8-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: .LBB5_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v4
-; GFX8-NEXT: v_readfirstlane_b32 s5, v3
-; GFX8-NEXT: v_mov_b32_e32 v3, s4
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s5, v1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-LABEL: add_i64_varying:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s6, s[2:3]
-; GFX9-NEXT: s_mov_b32 m0, s6
-; GFX9-NEXT: v_readlane_b32 s8, v0, s6
-; GFX9-NEXT: v_readlane_b32 s7, v3, s6
-; GFX9-NEXT: v_writelane_b32 v1, s4, m0
-; GFX9-NEXT: s_add_u32 s4, s4, s8
-; GFX9-NEXT: v_writelane_b32 v2, s5, m0
-; GFX9-NEXT: s_addc_u32 s5, s5, s7
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
-; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc
-; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX9-NEXT: s_cbranch_execz .LBB5_4
-; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
-; GFX9-NEXT: v_mov_b32_e32 v4, s5
-; GFX9-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: .LBB5_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v4
-; GFX9-NEXT: v_readfirstlane_b32 s5, v3
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v2, vcc
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX9-NEXT: s_endpgm
-;
-; GFX1064-LABEL: add_i64_varying:
-; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: s_mov_b64 s[4:5], 0
-; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s6, s[2:3]
-; GFX1064-NEXT: v_readlane_b32 s7, v0, s6
-; GFX1064-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1064-NEXT: v_writelane_b32 v1, s4, s6
-; GFX1064-NEXT: v_writelane_b32 v2, s5, s6
-; GFX1064-NEXT: s_add_u32 s4, s4, s7
-; GFX1064-NEXT: s_addc_u32 s5, s5, s8
-; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
-; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc
-; GFX1064-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1064-NEXT: s_cbranch_execz .LBB5_4
-; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v3, s4
-; GFX1064-NEXT: v_mov_b32_e32 v4, s5
-; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s10, -1
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s8, s2
-; GFX1064-NEXT: s_mov_b32 s9, s3
-; GFX1064-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: buffer_gl1_inv
-; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB5_4:
-; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v4
-; GFX1064-NEXT: v_add_co_u32 v0, vcc, s2, v1
-; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX1064-NEXT: s_endpgm
-;
-; GFX1032-LABEL: add_i64_varying:
-; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032-NEXT: s_mov_b64 s[4:5], 0
-; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s3, s2
-; GFX1032-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1032-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1032-NEXT: v_writelane_b32 v2, s5, s3
-; GFX1032-NEXT: s_add_u32 s4, s4, s6
-; GFX1032-NEXT: s_addc_u32 s5, s5, s7
-; GFX1032-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1032-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
-; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s6, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s6, exec_lo, s6
-; GFX1032-NEXT: s_cbranch_execz .LBB5_4
-; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v3, s4
-; GFX1032-NEXT: v_mov_b32_e32 v4, s5
-; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s10, -1
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s8, s2
-; GFX1032-NEXT: s_mov_b32 s9, s3
-; GFX1032-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: buffer_gl1_inv
-; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB5_4:
-; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v4
-; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
-; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX1032-NEXT: s_endpgm
-;
-; GFX1164-LABEL: add_i64_varying:
-; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_mov_b64 s[4:5], 0
-; GFX1164-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s6, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1164-NEXT: v_readlane_b32 s7, v0, s6
-; GFX1164-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1164-NEXT: v_writelane_b32 v1, s4, s6
-; GFX1164-NEXT: v_writelane_b32 v2, s5, s6
-; GFX1164-NEXT: s_add_u32 s4, s4, s7
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_addc_u32 s5, s5, s8
-; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
-; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1164-NEXT: s_cbranch_execz .LBB5_4
-; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v3, s4
-; GFX1164-NEXT: v_mov_b32_e32 v4, s5
-; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s10, -1
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s8, s2
-; GFX1164-NEXT: s_mov_b32 s9, s3
-; GFX1164-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], 0 glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: buffer_gl1_inv
-; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB5_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v4
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_add_co_u32 v0, vcc, s2, v1
-; GFX1164-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164-NEXT: s_nop 0
-; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1164-NEXT: s_endpgm
-;
-; GFX1132-LABEL: add_i64_varying:
-; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: s_mov_b64 s[4:5], 0
-; GFX1132-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1132-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1132-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1132-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1132-NEXT: v_writelane_b32 v2, s5, s3
-; GFX1132-NEXT: s_add_u32 s4, s4, s6
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: s_addc_u32 s5, s5, s7
-; GFX1132-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
-; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s6, exec_lo
-; GFX1132-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_xor_b32 s6, exec_lo, s6
-; GFX1132-NEXT: s_cbranch_execz .LBB5_4
-; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
-; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s10, -1
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s8, s2
-; GFX1132-NEXT: s_mov_b32 s9, s3
-; GFX1132-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], 0 glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: buffer_gl1_inv
-; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB5_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v4
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
-; GFX1132-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132-NEXT: s_nop 0
-; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1132-NEXT: s_endpgm
-;
-; GFX1264-LABEL: add_i64_varying:
-; GFX1264: ; %bb.0: ; %entry
-; GFX1264-NEXT: v_mov_b32_e32 v3, 0
-; GFX1264-NEXT: s_mov_b64 s[2:3], exec
-; GFX1264-NEXT: s_mov_b64 s[4:5], 0
-; GFX1264-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX1264-NEXT: .LBB5_1: ; %ComputeLoop
-; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT: s_ctz_i32_b64 s10, s[2:3]
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1264-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1264-NEXT: v_readlane_b32 s6, v0, s10
-; GFX1264-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1264-NEXT: v_writelane_b32 v2, s5, s10
-; GFX1264-NEXT: v_writelane_b32 v1, s4, s10
-; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
-; GFX1264-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
-; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1264-NEXT: s_cbranch_scc1 .LBB5_1
-; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1264-NEXT: s_mov_b64 s[6:7], exec
-; GFX1264-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1264-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1264-NEXT: s_cbranch_execz .LBB5_4
-; GFX1264-NEXT: ; %bb.3:
-; GFX1264-NEXT: v_mov_b32_e32 v3, s4
-; GFX1264-NEXT: v_mov_b32_e32 v4, s5
-; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT: s_mov_b32 s10, -1
-; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s8, s2
-; GFX1264-NEXT: s_mov_b32 s9, s3
-; GFX1264-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1264-NEXT: s_wait_loadcnt 0x0
-; GFX1264-NEXT: global_inv scope:SCOPE_DEV
-; GFX1264-NEXT: .LBB5_4:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1264-NEXT: v_readfirstlane_b32 s3, v4
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1264-NEXT: v_add_co_u32 v0, vcc, s2, v1
-; GFX1264-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
-; GFX1264-NEXT: s_nop 0
-; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1264-NEXT: s_endpgm
-;
-; GFX1232-LABEL: add_i64_varying:
-; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: v_mov_b32_e32 v3, 0
-; GFX1232-NEXT: s_mov_b32 s2, exec_lo
-; GFX1232-NEXT: s_mov_b64 s[4:5], 0
-; GFX1232-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX1232-NEXT: .LBB5_1: ; %ComputeLoop
-; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1232-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1232-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1232-NEXT: s_lshl_b32 s8, 1, s3
-; GFX1232-NEXT: v_writelane_b32 v2, s5, s3
-; GFX1232-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1232-NEXT: s_and_not1_b32 s2, s2, s8
-; GFX1232-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
-; GFX1232-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1232-NEXT: s_cbranch_scc1 .LBB5_1
-; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1232-NEXT: s_mov_b32 s6, exec_lo
-; GFX1232-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1232-NEXT: s_xor_b32 s6, exec_lo, s6
-; GFX1232-NEXT: s_cbranch_execz .LBB5_4
-; GFX1232-NEXT: ; %bb.3:
-; GFX1232-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
-; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s8, s2
-; GFX1232-NEXT: s_mov_b32 s9, s3
-; GFX1232-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1232-NEXT: s_wait_loadcnt 0x0
-; GFX1232-NEXT: global_inv scope:SCOPE_DEV
-; GFX1232-NEXT: .LBB5_4:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1232-NEXT: v_readfirstlane_b32 s3, v4
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1232-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
-; GFX1232-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s6, -1
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v2, s[2:3]
+; GFX1232-NEXT: v_mad_co_u64_u32 v[1:2], null, s1, v2, v[1:2]
+; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1232-NEXT: s_endpgm
+entry:
+ %old = atomicrmw add ptr addrspace(1) %inout, i64 %additive syncscope("agent") acq_rel
+ store i64 %old, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
+; GFX7LESS_ITERATIVE-LABEL: add_i64_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX7LESS_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s4, s4, s8
+; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB5_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5
+; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1
+; GFX7LESS_ITERATIVE-NEXT: .LBB5_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX7LESS_ITERATIVE-NEXT: v_add_i32_e32 v0, vcc, s5, v1
+; GFX7LESS_ITERATIVE-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
+;
+; GFX8_ITERATIVE-LABEL: add_i64_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX8_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX8_ITERATIVE-NEXT: s_add_u32 s4, s4, s8
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0
+; GFX8_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5
+; GFX8_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol
+; GFX8_ITERATIVE-NEXT: .LBB5_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX8_ITERATIVE-NEXT: v_add_u32_e32 v0, vcc, s5, v1
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc
+; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
+;
+; GFX9_ITERATIVE-LABEL: add_i64_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX9_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX9_ITERATIVE-NEXT: s_add_u32 s4, s4, s8
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0
+; GFX9_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5
+; GFX9_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol
+; GFX9_ITERATIVE-NEXT: .LBB5_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX9_ITERATIVE-NEXT: v_add_co_u32_e32 v0, vcc, s5, v1
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v2, vcc
+; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1064_ITERATIVE-LABEL: add_i64_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1064_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s6
+; GFX1064_ITERATIVE-NEXT: s_add_u32 s4, s4, s7
+; GFX1064_ITERATIVE-NEXT: s_addc_u32 s5, s5, s8
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1064_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB5_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1064_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v1
+; GFX1064_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1032_ITERATIVE-LABEL: add_i64_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1032_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s3
+; GFX1032_ITERATIVE-NEXT: s_add_u32 s4, s4, s6
+; GFX1032_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1032_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB5_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1032_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
+; GFX1032_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1164_ITERATIVE-LABEL: add_i64_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1164_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s6
+; GFX1164_ITERATIVE-NEXT: s_add_u32 s4, s4, s7
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_addc_u32 s5, s5, s8
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1164_ITERATIVE-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], 0 glc
+; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB5_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v1
+; GFX1164_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1132_ITERATIVE-LABEL: add_i64_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1132_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s3
+; GFX1132_ITERATIVE-NEXT: s_add_u32 s4, s4, s6
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s6, exec_lo
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1132_ITERATIVE-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], 0 glc
+; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB5_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
+; GFX1132_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1264_ITERATIVE-LABEL: add_i64_varying:
+; GFX1264_ITERATIVE: ; %bb.0: ; %entry
+; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s10
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s10
+; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[6:7], exec
+; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1264_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4
+; GFX1264_ITERATIVE-NEXT: ; %bb.3:
+; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1264_ITERATIVE-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0
+; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV
+; GFX1264_ITERATIVE-NEXT: .LBB5_4:
+; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
+; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v1
+; GFX1264_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1264_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1264_ITERATIVE-NEXT: s_nop 0
+; GFX1264_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1264_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1232_ITERATIVE-LABEL: add_i64_varying:
+; GFX1232_ITERATIVE: ; %bb.0: ; %entry
+; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
+; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
+; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s3
+; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s6, exec_lo
+; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1232_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6
+; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4
+; GFX1232_ITERATIVE-NEXT: ; %bb.3:
+; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1232_ITERATIVE-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0
+; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV
+; GFX1232_ITERATIVE-NEXT: .LBB5_4:
+; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
+; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
+; GFX1232_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1232_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1232_ITERATIVE-NEXT: s_nop 0
+; GFX1232_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1232_ITERATIVE-NEXT: s_endpgm
+;
+; GFX7LESS_DPP-LABEL: add_i64_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 s10, s6
+; GFX7LESS_DPP-NEXT: s_mov_b32 s11, s7
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s8, s2
+; GFX7LESS_DPP-NEXT: s_mov_b32 s9, s3
+; GFX7LESS_DPP-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
+; GFX7LESS_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS_DPP-NEXT: buffer_wbinvl1
+; GFX7LESS_DPP-NEXT: s_mov_b32 s4, s0
+; GFX7LESS_DPP-NEXT: s_mov_b32 s5, s1
+; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
+;
+; GFX8_DPP-LABEL: add_i64_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB5_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s7
+; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s10, -1
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: s_mov_b32 s8, s2
+; GFX8_DPP-NEXT: s_mov_b32 s9, s3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s6
+; GFX8_DPP-NEXT: buffer_atomic_add_x2 v[7:8], off, s[8:11], 0 glc
+; GFX8_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX8_DPP-NEXT: buffer_wbinvl1_vol
+; GFX8_DPP-NEXT: .LBB5_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX8_DPP-NEXT: v_add_u32_e32 v7, vcc, s5, v7
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v8, vcc, v0, v8, vcc
+; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: add_i64_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s7, v4, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s6, v3, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB5_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s7
+; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s10, -1
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: s_mov_b32 s8, s2
+; GFX9_DPP-NEXT: s_mov_b32 s9, s3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s6
+; GFX9_DPP-NEXT: buffer_atomic_add_x2 v[7:8], off, s[8:11], 0 glc
+; GFX9_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9_DPP-NEXT: buffer_wbinvl1_vol
+; GFX9_DPP-NEXT: .LBB5_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v7, vcc, s5, v7
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v8, vcc, v0, v8, vcc
+; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: add_i64_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s10, v3, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 16
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s11, v4, 47
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 63
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 32
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7]
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s11, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s10, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB5_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1064_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1064_DPP-NEXT: buffer_atomic_add_x2 v[9:10], off, s[4:7], 0 glc
+; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl1_inv
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB5_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v9
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v10
+; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s2, v11
+; GFX1064_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s3, v12, vcc
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: add_i64_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s8, v4, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s7, v3, 15
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v2, s8, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB5_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1032_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1032_DPP-NEXT: buffer_atomic_add_x2 v[9:10], off, s[4:7], 0 glc
+; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl1_inv
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB5_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v9
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v10
+; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s2, v11
+; GFX1032_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: add_i64_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v9
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v7, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 15
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s8, v3, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s9, v4, 31
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 63
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s10, v3, 47
+; GFX1164_DPP-NEXT: v_readlane_b32 s11, v4, 47
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 63
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7]
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s10, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s11, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[8:9]
+; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1164_DPP-NEXT: s_mov_b64 s[8:9], exec
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB5_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s5
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s4
+; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1164_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1164_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], 0 glc
+; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl1_inv
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB5_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v8
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, v2
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v9
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_u32 v8, vcc, s2, v10
+; GFX1164_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s3, v11, vcc
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: add_i64_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v9
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v5, v4
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo
+; GFX1132_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s7, v3, 15
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_readlane_b32 s8, v4, 15
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v2, s8, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6
+; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: s_mov_b32 s8, exec_lo
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB5_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
+; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1132_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1132_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], 0 glc
+; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl1_inv
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB5_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v8
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v2
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v9
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s2, v10
+; GFX1132_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
+;
+; GFX1264_DPP-LABEL: add_i64_varying:
+; GFX1264_DPP: ; %bb.0: ; %entry
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, v0
+; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1264_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v9
+; GFX1264_DPP-NEXT: s_not_b64 exec, exec
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1264_DPP-NEXT: s_not_b64 exec, exec
+; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT: v_readlane_b32 s5, v3, 31
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5
+; GFX1264_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v7, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1264_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_readlane_b32 s6, v3, 15
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_readlane_b32 s7, v4, 15
+; GFX1264_DPP-NEXT: v_readlane_b32 s8, v3, 31
+; GFX1264_DPP-NEXT: v_readlane_b32 s9, v4, 31
+; GFX1264_DPP-NEXT: v_writelane_b32 v1, s6, 16
+; GFX1264_DPP-NEXT: v_readlane_b32 s6, v3, 63
+; GFX1264_DPP-NEXT: v_writelane_b32 v2, s7, 16
+; GFX1264_DPP-NEXT: v_readlane_b32 s10, v3, 47
+; GFX1264_DPP-NEXT: v_readlane_b32 s11, v4, 47
+; GFX1264_DPP-NEXT: v_readlane_b32 s7, v4, 63
+; GFX1264_DPP-NEXT: v_writelane_b32 v1, s8, 32
+; GFX1264_DPP-NEXT: v_writelane_b32 v2, s9, 32
+; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX1264_DPP-NEXT: s_mov_b64 s[4:5], s[6:7]
+; GFX1264_DPP-NEXT: v_writelane_b32 v1, s10, 48
+; GFX1264_DPP-NEXT: v_writelane_b32 v2, s11, 48
+; GFX1264_DPP-NEXT: s_mov_b64 exec, s[8:9]
+; GFX1264_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec
+; GFX1264_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1264_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1264_DPP-NEXT: s_cbranch_execz .LBB5_2
+; GFX1264_DPP-NEXT: ; %bb.1:
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, s5
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, s4
+; GFX1264_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0
+; GFX1264_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1264_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1264_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN
+; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0
+; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV
+; GFX1264_DPP-NEXT: .LBB5_2:
+; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0
+; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v8
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v10, v1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v11, v2
+; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v9
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_add_co_u32 v8, vcc, s2, v10
+; GFX1264_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s3, v11, vcc
+; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null
+; GFX1264_DPP-NEXT: s_nop 0
+; GFX1264_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1264_DPP-NEXT: s_endpgm
+;
+; GFX1232_DPP-LABEL: add_i64_varying:
+; GFX1232_DPP: ; %bb.0: ; %entry
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v8, v0
+; GFX1232_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v9
+; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1232_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1232_DPP-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v6, v2
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v5, v4
+; GFX1232_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo
+; GFX1232_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: v_readlane_b32 s7, v3, 15
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1232_DPP-NEXT: v_readlane_b32 s8, v4, 15
+; GFX1232_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6
+; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1232_DPP-NEXT: v_writelane_b32 v1, s7, 16
+; GFX1232_DPP-NEXT: v_writelane_b32 v2, s8, 16
+; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6
+; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo
+; GFX1232_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1232_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1232_DPP-NEXT: s_cbranch_execz .LBB5_2
+; GFX1232_DPP-NEXT: ; %bb.1:
+; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
+; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0
+; GFX1232_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1232_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1232_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN
+; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0
+; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV
+; GFX1232_DPP-NEXT: .LBB5_2:
+; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0
+; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v8
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v10, v1
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v11, v2
+; GFX1232_DPP-NEXT: v_readfirstlane_b32 s3, v9
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s2, v10
+; GFX1232_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
+; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null
+; GFX1232_DPP-NEXT: s_nop 0
+; GFX1232_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1232_DPP-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%zext = zext i32 %lane to i64
@@ -2773,475 +4283,1016 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
-; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1264-NEXT: s_cbranch_execz .LBB7_2
-; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1264-NEXT: s_mov_b32 s15, 0x31016000
-; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mul_i32 s2, s8, s2
-; GFX1264-NEXT: s_mov_b32 s14, -1
-; GFX1264-NEXT: v_mov_b32_e32 v1, s2
-; GFX1264-NEXT: s_mov_b32 s12, s6
-; GFX1264-NEXT: s_mov_b32 s13, s7
-; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN
-; GFX1264-NEXT: s_wait_loadcnt 0x0
-; GFX1264-NEXT: global_inv scope:SCOPE_DEV
-; GFX1264-NEXT: .LBB7_2:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_mul_lo_u32 v0, s8, v0
-; GFX1264-NEXT: v_readfirstlane_b32 s0, v1
-; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1264-NEXT: s_mov_b32 s6, -1
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s0, v0
-; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null
-; GFX1264-NEXT: s_nop 0
-; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1264-NEXT: s_endpgm
-;
-; GFX1232-LABEL: sub_i32_uniform:
-; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: s_clause 0x1
-; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1232-NEXT: s_load_b32 s0, s[0:1], 0x34
-; GFX1232-NEXT: s_mov_b32 s2, exec_lo
-; GFX1232-NEXT: s_mov_b32 s1, exec_lo
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1232-NEXT: ; implicit-def: $vgpr1
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1232-NEXT: s_cbranch_execz .LBB7_2
-; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mul_i32 s2, s0, s2
-; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: v_mov_b32_e32 v1, s2
-; GFX1232-NEXT: s_mov_b32 s8, s6
-; GFX1232-NEXT: s_mov_b32 s9, s7
-; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1232-NEXT: s_wait_loadcnt 0x0
-; GFX1232-NEXT: global_inv scope:SCOPE_DEV
-; GFX1232-NEXT: .LBB7_2:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_mul_lo_u32 v0, s0, v0
-; GFX1232-NEXT: v_readfirstlane_b32 s0, v1
-; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1232-NEXT: s_mov_b32 s6, -1
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s0, v0
-; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null
-; GFX1232-NEXT: s_nop 0
-; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1232-NEXT: s_endpgm
-entry:
- %old = atomicrmw sub ptr addrspace(1) %inout, i32 %subitive syncscope("agent") acq_rel
- store i32 %old, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
-; GFX7LESS-LABEL: sub_i32_varying:
-; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s6, -1
-; GFX7LESS-NEXT: s_mov_b32 s10, s6
-; GFX7LESS-NEXT: s_mov_b32 s11, s7
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mov_b32 s8, s2
-; GFX7LESS-NEXT: s_mov_b32 s9, s3
-; GFX7LESS-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
-; GFX7LESS-NEXT: s_mov_b32 s4, s0
-; GFX7LESS-NEXT: s_mov_b32 s5, s1
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; GFX7LESS-NEXT: s_endpgm
-;
-; GFX8-LABEL: sub_i32_varying:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: s_mov_b32 s6, 0
-; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: .LBB8_1: ; %ComputeLoop
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s4, s[2:3]
-; GFX8-NEXT: s_mov_b32 m0, s4
-; GFX8-NEXT: v_readlane_b32 s7, v0, s4
-; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4
-; GFX8-NEXT: v_writelane_b32 v1, s6, m0
-; GFX8-NEXT: s_add_i32 s6, s6, s7
-; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8-NEXT: s_cbranch_scc1 .LBB8_1
-; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execz .LBB8_4
-; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: s_mov_b32 s11, 0xf000
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: .LBB8_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-LABEL: sub_i32_varying:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: s_mov_b32 s6, 0
-; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: .LBB8_1: ; %ComputeLoop
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s4, s[2:3]
-; GFX9-NEXT: s_mov_b32 m0, s4
-; GFX9-NEXT: v_readlane_b32 s7, v0, s4
-; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4
-; GFX9-NEXT: v_writelane_b32 v1, s6, m0
-; GFX9-NEXT: s_add_i32 s6, s6, s7
-; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
-; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execz .LBB8_4
-; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: .LBB8_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX9-NEXT: s_endpgm
-;
-; GFX1064-LABEL: sub_i32_varying:
-; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: s_mov_b32 s6, 0
-; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s7, s[2:3]
-; GFX1064-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1064-NEXT: v_writelane_b32 v1, s6, s7
-; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX1064-NEXT: s_add_i32 s6, s6, s8
-; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
-; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064-NEXT: s_cbranch_execz .LBB8_4
-; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
-; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s10, -1
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s8, s2
-; GFX1064-NEXT: s_mov_b32 s9, s3
-; GFX1064-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: buffer_gl1_inv
-; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB8_4:
-; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX1064-NEXT: s_endpgm
-;
-; GFX1032-LABEL: sub_i32_varying:
-; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032-NEXT: s_mov_b32 s4, 0
-; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: .LBB8_1: ; %ComputeLoop
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s3, s2
-; GFX1032-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1032-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1032-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1032-NEXT: s_andn2_b32 s2, s2, s6
-; GFX1032-NEXT: s_add_i32 s4, s4, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
-; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5
-; GFX1032-NEXT: s_cbranch_execz .LBB8_4
-; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v0, s4
-; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s10, -1
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s8, s2
-; GFX1032-NEXT: s_mov_b32 s9, s3
-; GFX1032-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: buffer_gl1_inv
-; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB8_4:
-; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX1032-NEXT: s_endpgm
-;
-; GFX1164-LABEL: sub_i32_varying:
-; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_mov_b32 s6, 0
-; GFX1164-NEXT: ; implicit-def: $vgpr1
-; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1164-NEXT: v_writelane_b32 v1, s6, s7
-; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1164-NEXT: s_add_i32 s6, s6, s8
-; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
-; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1164-NEXT: s_cbranch_execz .LBB8_4
-; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, s6
-; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s10, -1
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s8, s2
-; GFX1164-NEXT: s_mov_b32 s9, s3
-; GFX1164-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: buffer_gl1_inv
-; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB8_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164-NEXT: s_nop 0
-; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1164-NEXT: s_endpgm
-;
-; GFX1132-LABEL: sub_i32_varying:
-; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, 0
-; GFX1132-NEXT: ; implicit-def: $vgpr1
-; GFX1132-NEXT: .LBB8_1: ; %ComputeLoop
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1132-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1132-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1132-NEXT: s_and_not1_b32 s2, s2, s6
-; GFX1132-NEXT: s_add_i32 s4, s4, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
-; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5
-; GFX1132-NEXT: s_cbranch_execz .LBB8_4
-; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, s4
-; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s10, -1
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s8, s2
-; GFX1132-NEXT: s_mov_b32 s9, s3
-; GFX1132-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: buffer_gl1_inv
-; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB8_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132-NEXT: s_nop 0
-; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1132-NEXT: s_endpgm
-;
-; GFX1264-LABEL: sub_i32_varying:
-; GFX1264: ; %bb.0: ; %entry
-; GFX1264-NEXT: s_mov_b64 s[2:3], exec
-; GFX1264-NEXT: s_mov_b32 s6, 0
-; GFX1264-NEXT: ; implicit-def: $vgpr1
-; GFX1264-NEXT: .LBB8_1: ; %ComputeLoop
-; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1264-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1264-NEXT: v_writelane_b32 v1, s6, s7
-; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1264-NEXT: s_add_co_i32 s6, s6, s8
-; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1264-NEXT: s_cbranch_scc1 .LBB8_1
-; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1264-NEXT: ; implicit-def: $vgpr0
-; GFX1264-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1264-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1264-NEXT: s_cbranch_execz .LBB8_4
-; GFX1264-NEXT: ; %bb.3:
-; GFX1264-NEXT: v_mov_b32_e32 v0, s6
-; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT: s_mov_b32 s10, -1
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1264-NEXT: s_cbranch_execz .LBB7_2
+; GFX1264-NEXT: ; %bb.1:
+; GFX1264-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1264-NEXT: s_mov_b32 s15, 0x31016000
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s8, s2
-; GFX1264-NEXT: s_mov_b32 s9, s3
-; GFX1264-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1264-NEXT: s_mul_i32 s2, s8, s2
+; GFX1264-NEXT: s_mov_b32 s14, -1
+; GFX1264-NEXT: v_mov_b32_e32 v1, s2
+; GFX1264-NEXT: s_mov_b32 s12, s6
+; GFX1264-NEXT: s_mov_b32 s13, s7
+; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
-; GFX1264-NEXT: .LBB8_4:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1264-NEXT: .LBB7_2:
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT: v_mul_lo_u32 v0, s8, v0
+; GFX1264-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1264-NEXT: s_endpgm
;
-; GFX1232-LABEL: sub_i32_varying:
+; GFX1232-LABEL: sub_i32_uniform:
; GFX1232: ; %bb.0: ; %entry
+; GFX1232-NEXT: s_clause 0x1
+; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1232-NEXT: s_load_b32 s0, s[0:1], 0x34
; GFX1232-NEXT: s_mov_b32 s2, exec_lo
-; GFX1232-NEXT: s_mov_b32 s4, 0
+; GFX1232-NEXT: s_mov_b32 s1, exec_lo
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
-; GFX1232-NEXT: .LBB8_1: ; %ComputeLoop
-; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1232-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1232-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1232-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1232-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1232-NEXT: s_and_not1_b32 s2, s2, s6
-; GFX1232-NEXT: s_add_co_i32 s4, s4, s5
-; GFX1232-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1232-NEXT: s_cbranch_scc1 .LBB8_1
-; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1232-NEXT: ; implicit-def: $vgpr0
-; GFX1232-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1232-NEXT: s_xor_b32 s5, exec_lo, s5
-; GFX1232-NEXT: s_cbranch_execz .LBB8_4
-; GFX1232-NEXT: ; %bb.3:
-; GFX1232-NEXT: v_mov_b32_e32 v0, s4
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1232-NEXT: s_cbranch_execz .LBB7_2
+; GFX1232-NEXT: ; %bb.1:
+; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT: s_mov_b32 s10, -1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s8, s2
-; GFX1232-NEXT: s_mov_b32 s9, s3
-; GFX1232-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1232-NEXT: s_mul_i32 s2, s0, s2
+; GFX1232-NEXT: s_mov_b32 s10, -1
+; GFX1232-NEXT: v_mov_b32_e32 v1, s2
+; GFX1232-NEXT: s_mov_b32 s8, s6
+; GFX1232-NEXT: s_mov_b32 s9, s7
+; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
-; GFX1232-NEXT: .LBB8_4:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1232-NEXT: .LBB7_2:
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-NEXT: v_mul_lo_u32 v0, s0, v0
+; GFX1232-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1232-NEXT: s_endpgm
+entry:
+ %old = atomicrmw sub ptr addrspace(1) %inout, i32 %subitive syncscope("agent") acq_rel
+ store i32 %old, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
+; GFX7LESS_ITERATIVE-LABEL: sub_i32_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s6, 0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX7LESS_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s4
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB8_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6
+; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1
+; GFX7LESS_ITERATIVE-NEXT: .LBB8_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: v_sub_i32_e32 v0, vcc, s4, v1
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
+;
+; GFX8_ITERATIVE-LABEL: sub_i32_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s6, 0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX8_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s4
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
+; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6
+; GFX8_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol
+; GFX8_ITERATIVE-NEXT: .LBB8_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_sub_u32_e32 v0, vcc, s4, v1
+; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
+;
+; GFX9_ITERATIVE-LABEL: sub_i32_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, 0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX9_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s4
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
+; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6
+; GFX9_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol
+; GFX9_ITERATIVE-NEXT: .LBB8_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_sub_u32_e32 v0, s4, v1
+; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1064_ITERATIVE-LABEL: sub_i32_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1064_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1064_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB8_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1032_ITERATIVE-LABEL: sub_i32_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1032_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s3
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s6
+; GFX1032_ITERATIVE-NEXT: s_add_i32 s4, s4, s5
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1032_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB8_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1164_ITERATIVE-LABEL: sub_i32_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s6, 0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1164_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1164_ITERATIVE-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
+; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB8_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1132_ITERATIVE-LABEL: sub_i32_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1132_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s3
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s6
+; GFX1132_ITERATIVE-NEXT: s_add_i32 s4, s4, s5
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1132_ITERATIVE-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
+; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB8_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1264_ITERATIVE-LABEL: sub_i32_varying:
+; GFX1264_ITERATIVE: ; %bb.0: ; %entry
+; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s6, 0
+; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
+; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
+; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8
+; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4
+; GFX1264_ITERATIVE-NEXT: ; %bb.3:
+; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1264_ITERATIVE-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0
+; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV
+; GFX1264_ITERATIVE-NEXT: .LBB8_4:
+; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
+; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1264_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1264_ITERATIVE-NEXT: s_nop 0
+; GFX1264_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1264_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1232_ITERATIVE-LABEL: sub_i32_varying:
+; GFX1232_ITERATIVE: ; %bb.0: ; %entry
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
+; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s3
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s3
+; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s6
+; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s4, s4, s5
+; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; GFX1232_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4
+; GFX1232_ITERATIVE-NEXT: ; %bb.3:
+; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1232_ITERATIVE-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0
+; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV
+; GFX1232_ITERATIVE-NEXT: .LBB8_4:
+; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
+; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1232_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1232_ITERATIVE-NEXT: s_nop 0
+; GFX1232_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1232_ITERATIVE-NEXT: s_endpgm
+;
+; GFX7LESS_DPP-LABEL: sub_i32_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1
+; GFX7LESS_DPP-NEXT: s_mov_b32 s10, s6
+; GFX7LESS_DPP-NEXT: s_mov_b32 s11, s7
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s8, s2
+; GFX7LESS_DPP-NEXT: s_mov_b32 s9, s3
+; GFX7LESS_DPP-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX7LESS_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS_DPP-NEXT: buffer_wbinvl1
+; GFX7LESS_DPP-NEXT: s_mov_b32 s4, s0
+; GFX7LESS_DPP-NEXT: s_mov_b32 s5, s1
+; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
+;
+; GFX8_DPP-LABEL: sub_i32_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s6, v2, 63
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB8_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s10, -1
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: s_mov_b32 s8, s2
+; GFX8_DPP-NEXT: s_mov_b32 s9, s3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s6
+; GFX8_DPP-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX8_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX8_DPP-NEXT: buffer_wbinvl1_vol
+; GFX8_DPP-NEXT: .LBB8_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: sub_i32_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s6, v2, 63
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB8_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s10, -1
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: s_mov_b32 s8, s2
+; GFX9_DPP-NEXT: s_mov_b32 s9, s3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s6
+; GFX9_DPP-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX9_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9_DPP-NEXT: buffer_wbinvl1_vol
+; GFX9_DPP-NEXT: .LBB8_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_sub_u32_e32 v0, s4, v0
+; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: sub_i32_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 63
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX1064_DPP-NEXT: s_mov_b32 s4, s9
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s8, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB8_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1064_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1064_DPP-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc
+; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl1_inv
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB8_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: sub_i32_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v3, s5, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s4, s6
+; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB8_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1032_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1032_DPP-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc
+; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl1_inv
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB8_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: sub_i32_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 63
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s7, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX1164_DPP-NEXT: s_mov_b32 s4, s9
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s8, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB8_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1164_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1164_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc
+; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl1_inv
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB8_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX1164_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: sub_i32_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132_DPP-NEXT: s_mov_b32 s4, s6
+; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB8_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1132_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1132_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc
+; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl1_inv
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB8_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX1132_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
+;
+; GFX1264_DPP-LABEL: sub_i32_varying:
+; GFX1264_DPP: ; %bb.0: ; %entry
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1264_DPP-NEXT: s_not_b64 exec, exec
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1264_DPP-NEXT: s_not_b64 exec, exec
+; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 31
+; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 16
+; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1264_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 63
+; GFX1264_DPP-NEXT: v_writelane_b32 v3, s7, 32
+; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX1264_DPP-NEXT: s_mov_b32 s4, s9
+; GFX1264_DPP-NEXT: v_writelane_b32 v3, s8, 48
+; GFX1264_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1264_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1264_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1264_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1264_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX1264_DPP-NEXT: s_cbranch_execz .LBB8_2
+; GFX1264_DPP-NEXT: ; %bb.1:
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX1264_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0
+; GFX1264_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1264_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1264_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
+; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0
+; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV
+; GFX1264_DPP-NEXT: .LBB8_2:
+; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0
+; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX1264_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1264_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1264_DPP-NEXT: s_nop 0
+; GFX1264_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1264_DPP-NEXT: s_endpgm
+;
+; GFX1232_DPP-LABEL: sub_i32_varying:
+; GFX1232_DPP: ; %bb.0: ; %entry
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1232_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
+; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1232_DPP-NEXT: s_mov_b32 s4, s6
+; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; GFX1232_DPP-NEXT: s_cbranch_execz .LBB8_2
+; GFX1232_DPP-NEXT: ; %bb.1:
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0
+; GFX1232_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1232_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1232_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
+; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0
+; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV
+; GFX1232_DPP-NEXT: .LBB8_2:
+; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0
+; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX1232_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1232_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1232_DPP-NEXT: s_nop 0
+; GFX1232_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1232_DPP-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%old = atomicrmw sub ptr addrspace(1) %inout, i32 %lane syncscope("agent") acq_rel
@@ -4023,459 +6074,1350 @@ entry:
}
define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
-; GFX7LESS-LABEL: sub_i64_varying:
-; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s6, -1
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: s_mov_b32 s10, s6
-; GFX7LESS-NEXT: s_mov_b32 s11, s7
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mov_b32 s8, s2
-; GFX7LESS-NEXT: s_mov_b32 s9, s3
-; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
-; GFX7LESS-NEXT: s_mov_b32 s4, s0
-; GFX7LESS-NEXT: s_mov_b32 s5, s1
-; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX7LESS-NEXT: s_endpgm
+; GFX7LESS_ITERATIVE-LABEL: sub_i64_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX7LESS_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s4, s4, s8
+; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB11_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5
+; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1
+; GFX7LESS_ITERATIVE-NEXT: .LBB11_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX7LESS_ITERATIVE-NEXT: v_sub_i32_e32 v0, vcc, s5, v1
+; GFX7LESS_ITERATIVE-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
;
-; GFX8-LABEL: sub_i64_varying:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: v_mov_b32_e32 v3, 0
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX8-NEXT: .LBB11_1: ; %ComputeLoop
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s6, s[2:3]
-; GFX8-NEXT: s_mov_b32 m0, s6
-; GFX8-NEXT: v_readlane_b32 s8, v0, s6
-; GFX8-NEXT: v_readlane_b32 s7, v3, s6
-; GFX8-NEXT: v_writelane_b32 v1, s4, m0
-; GFX8-NEXT: s_add_u32 s4, s4, s8
-; GFX8-NEXT: v_writelane_b32 v2, s5, m0
-; GFX8-NEXT: s_addc_u32 s5, s5, s7
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8-NEXT: s_cbranch_scc1 .LBB11_1
-; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc
-; GFX8-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execz .LBB11_4
-; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: v_mov_b32_e32 v3, s4
-; GFX8-NEXT: s_mov_b32 s11, 0xf000
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s5
-; GFX8-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: .LBB11_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v4
-; GFX8-NEXT: v_readfirstlane_b32 s5, v3
-; GFX8-NEXT: v_mov_b32_e32 v3, s4
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
+; GFX8_ITERATIVE-LABEL: sub_i64_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX8_ITERATIVE-NEXT: s_add_u32 s4, s4, s8
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0
+; GFX8_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5
+; GFX8_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
+; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol
+; GFX8_ITERATIVE-NEXT: .LBB11_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX8_ITERATIVE-NEXT: v_sub_u32_e32 v0, vcc, s5, v1
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc
+; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
;
-; GFX9-LABEL: sub_i64_varying:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX9-NEXT: .LBB11_1: ; %ComputeLoop
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s6, s[2:3]
-; GFX9-NEXT: s_mov_b32 m0, s6
-; GFX9-NEXT: v_readlane_b32 s8, v0, s6
-; GFX9-NEXT: v_readlane_b32 s7, v3, s6
-; GFX9-NEXT: v_writelane_b32 v1, s4, m0
-; GFX9-NEXT: s_add_u32 s4, s4, s8
-; GFX9-NEXT: v_writelane_b32 v2, s5, m0
-; GFX9-NEXT: s_addc_u32 s5, s5, s7
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
-; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc
-; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX9-NEXT: s_cbranch_execz .LBB11_4
-; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
-; GFX9-NEXT: v_mov_b32_e32 v4, s5
-; GFX9-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: .LBB11_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v4
-; GFX9-NEXT: v_readfirstlane_b32 s5, v3
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v2, vcc
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX9-NEXT: s_endpgm
+; GFX9_ITERATIVE-LABEL: sub_i64_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX9_ITERATIVE-NEXT: s_add_u32 s4, s4, s8
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0
+; GFX9_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5
+; GFX9_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
+; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol
+; GFX9_ITERATIVE-NEXT: .LBB11_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX9_ITERATIVE-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v1
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v2, vcc
+; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
;
-; GFX1064-LABEL: sub_i64_varying:
-; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: s_mov_b64 s[4:5], 0
-; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s6, s[2:3]
-; GFX1064-NEXT: v_readlane_b32 s7, v0, s6
-; GFX1064-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1064-NEXT: v_writelane_b32 v1, s4, s6
-; GFX1064-NEXT: v_writelane_b32 v2, s5, s6
-; GFX1064-NEXT: s_add_u32 s4, s4, s7
-; GFX1064-NEXT: s_addc_u32 s5, s5, s8
-; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
-; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc
-; GFX1064-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1064-NEXT: s_cbranch_execz .LBB11_4
-; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v3, s4
-; GFX1064-NEXT: v_mov_b32_e32 v4, s5
-; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s10, -1
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s8, s2
-; GFX1064-NEXT: s_mov_b32 s9, s3
-; GFX1064-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: buffer_gl1_inv
-; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB11_4:
-; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v4
-; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v1
-; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX1064-NEXT: s_endpgm
+; GFX1064_ITERATIVE-LABEL: sub_i64_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1064_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s6
+; GFX1064_ITERATIVE-NEXT: s_add_u32 s4, s4, s7
+; GFX1064_ITERATIVE-NEXT: s_addc_u32 s5, s5, s8
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1064_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
+; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB11_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1064_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v1
+; GFX1064_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
;
-; GFX1032-LABEL: sub_i64_varying:
-; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032-NEXT: s_mov_b64 s[4:5], 0
-; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s3, s2
-; GFX1032-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1032-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1032-NEXT: v_writelane_b32 v2, s5, s3
-; GFX1032-NEXT: s_add_u32 s4, s4, s6
-; GFX1032-NEXT: s_addc_u32 s5, s5, s7
-; GFX1032-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1032-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
-; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s6, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s6, exec_lo, s6
-; GFX1032-NEXT: s_cbranch_execz .LBB11_4
-; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v3, s4
-; GFX1032-NEXT: v_mov_b32_e32 v4, s5
-; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s10, -1
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s8, s2
-; GFX1032-NEXT: s_mov_b32 s9, s3
-; GFX1032-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: buffer_gl1_inv
-; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB11_4:
-; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v4
-; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1
-; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX1032-NEXT: s_endpgm
+; GFX1032_ITERATIVE-LABEL: sub_i64_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1032_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s3
+; GFX1032_ITERATIVE-NEXT: s_add_u32 s4, s4, s6
+; GFX1032_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1032_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
+; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB11_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1032_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1
+; GFX1032_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
;
-; GFX1164-LABEL: sub_i64_varying:
-; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_mov_b64 s[4:5], 0
-; GFX1164-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s6, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1164-NEXT: v_readlane_b32 s7, v0, s6
-; GFX1164-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1164-NEXT: v_writelane_b32 v1, s4, s6
-; GFX1164-NEXT: v_writelane_b32 v2, s5, s6
-; GFX1164-NEXT: s_add_u32 s4, s4, s7
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_addc_u32 s5, s5, s8
-; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
-; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1164-NEXT: s_cbranch_execz .LBB11_4
-; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v3, s4
-; GFX1164-NEXT: v_mov_b32_e32 v4, s5
-; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s10, -1
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s8, s2
-; GFX1164-NEXT: s_mov_b32 s9, s3
-; GFX1164-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], 0 glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: buffer_gl1_inv
-; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB11_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v4
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v1
-; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164-NEXT: s_nop 0
-; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1164-NEXT: s_endpgm
+; GFX1164_ITERATIVE-LABEL: sub_i64_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1164_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s6
+; GFX1164_ITERATIVE-NEXT: s_add_u32 s4, s4, s7
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_addc_u32 s5, s5, s8
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1164_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], 0 glc
+; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB11_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v1
+; GFX1164_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
;
-; GFX1132-LABEL: sub_i64_varying:
-; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: s_mov_b64 s[4:5], 0
-; GFX1132-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1132-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1132-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1132-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1132-NEXT: v_writelane_b32 v2, s5, s3
-; GFX1132-NEXT: s_add_u32 s4, s4, s6
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: s_addc_u32 s5, s5, s7
-; GFX1132-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
-; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s6, exec_lo
-; GFX1132-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_xor_b32 s6, exec_lo, s6
-; GFX1132-NEXT: s_cbranch_execz .LBB11_4
-; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
-; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s10, -1
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s8, s2
-; GFX1132-NEXT: s_mov_b32 s9, s3
-; GFX1132-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], 0 glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: buffer_gl1_inv
-; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB11_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v4
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1
-; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132-NEXT: s_nop 0
-; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1132-NEXT: s_endpgm
+; GFX1132_ITERATIVE-LABEL: sub_i64_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1132_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s3
+; GFX1132_ITERATIVE-NEXT: s_add_u32 s4, s4, s6
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s6, exec_lo
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1132_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], 0 glc
+; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB11_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1
+; GFX1132_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
;
-; GFX1264-LABEL: sub_i64_varying:
-; GFX1264: ; %bb.0: ; %entry
-; GFX1264-NEXT: v_mov_b32_e32 v3, 0
-; GFX1264-NEXT: s_mov_b64 s[2:3], exec
-; GFX1264-NEXT: s_mov_b64 s[4:5], 0
-; GFX1264-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX1264-NEXT: .LBB11_1: ; %ComputeLoop
-; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT: s_ctz_i32_b64 s10, s[2:3]
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1264-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1264-NEXT: v_readlane_b32 s6, v0, s10
-; GFX1264-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1264-NEXT: v_writelane_b32 v2, s5, s10
-; GFX1264-NEXT: v_writelane_b32 v1, s4, s10
-; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
-; GFX1264-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
-; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1264-NEXT: s_cbranch_scc1 .LBB11_1
-; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1264-NEXT: s_mov_b64 s[6:7], exec
-; GFX1264-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1264-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1264-NEXT: s_cbranch_execz .LBB11_4
-; GFX1264-NEXT: ; %bb.3:
-; GFX1264-NEXT: v_mov_b32_e32 v3, s4
-; GFX1264-NEXT: v_mov_b32_e32 v4, s5
-; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT: s_mov_b32 s10, -1
-; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s8, s2
-; GFX1264-NEXT: s_mov_b32 s9, s3
-; GFX1264-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1264-NEXT: s_wait_loadcnt 0x0
-; GFX1264-NEXT: global_inv scope:SCOPE_DEV
-; GFX1264-NEXT: .LBB11_4:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1264-NEXT: v_readfirstlane_b32 s3, v4
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s2, v1
-; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
-; GFX1264-NEXT: s_nop 0
-; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1264-NEXT: s_endpgm
+; GFX1264_ITERATIVE-LABEL: sub_i64_varying:
+; GFX1264_ITERATIVE: ; %bb.0: ; %entry
+; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s10
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s10
+; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[6:7], exec
+; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1264_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4
+; GFX1264_ITERATIVE-NEXT: ; %bb.3:
+; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1264_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0
+; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV
+; GFX1264_ITERATIVE-NEXT: .LBB11_4:
+; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
+; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v1
+; GFX1264_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1264_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1264_ITERATIVE-NEXT: s_nop 0
+; GFX1264_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1264_ITERATIVE-NEXT: s_endpgm
;
-; GFX1232-LABEL: sub_i64_varying:
-; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: v_mov_b32_e32 v3, 0
-; GFX1232-NEXT: s_mov_b32 s2, exec_lo
-; GFX1232-NEXT: s_mov_b64 s[4:5], 0
-; GFX1232-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX1232-NEXT: .LBB11_1: ; %ComputeLoop
-; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1232-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1232-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1232-NEXT: s_lshl_b32 s8, 1, s3
-; GFX1232-NEXT: v_writelane_b32 v2, s5, s3
-; GFX1232-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1232-NEXT: s_and_not1_b32 s2, s2, s8
-; GFX1232-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
-; GFX1232-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1232-NEXT: s_cbranch_scc1 .LBB11_1
-; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1232-NEXT: s_mov_b32 s6, exec_lo
-; GFX1232-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1232-NEXT: s_xor_b32 s6, exec_lo, s6
-; GFX1232-NEXT: s_cbranch_execz .LBB11_4
-; GFX1232-NEXT: ; %bb.3:
-; GFX1232-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
-; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s8, s2
-; GFX1232-NEXT: s_mov_b32 s9, s3
-; GFX1232-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1232-NEXT: s_wait_loadcnt 0x0
-; GFX1232-NEXT: global_inv scope:SCOPE_DEV
-; GFX1232-NEXT: .LBB11_4:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1232-NEXT: v_readfirstlane_b32 s3, v4
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1
-; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
-; GFX1232-NEXT: s_nop 0
-; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1232-NEXT: s_endpgm
+; GFX1232_ITERATIVE-LABEL: sub_i64_varying:
+; GFX1232_ITERATIVE: ; %bb.0: ; %entry
+; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
+; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
+; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s3
+; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s6, exec_lo
+; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1232_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6
+; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4
+; GFX1232_ITERATIVE-NEXT: ; %bb.3:
+; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1232_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0
+; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV
+; GFX1232_ITERATIVE-NEXT: .LBB11_4:
+; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
+; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1
+; GFX1232_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1232_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1232_ITERATIVE-NEXT: s_nop 0
+; GFX1232_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1232_ITERATIVE-NEXT: s_endpgm
+;
+; GFX7LESS_DPP-LABEL: sub_i64_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 s10, s6
+; GFX7LESS_DPP-NEXT: s_mov_b32 s11, s7
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s8, s2
+; GFX7LESS_DPP-NEXT: s_mov_b32 s9, s3
+; GFX7LESS_DPP-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
+; GFX7LESS_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS_DPP-NEXT: buffer_wbinvl1
+; GFX7LESS_DPP-NEXT: s_mov_b32 s4, s0
+; GFX7LESS_DPP-NEXT: s_mov_b32 s5, s1
+; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
+;
+; GFX8_DPP-LABEL: sub_i64_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB11_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s7
+; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s10, -1
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: s_mov_b32 s8, s2
+; GFX8_DPP-NEXT: s_mov_b32 s9, s3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s6
+; GFX8_DPP-NEXT: buffer_atomic_sub_x2 v[7:8], off, s[8:11], 0 glc
+; GFX8_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX8_DPP-NEXT: buffer_wbinvl1_vol
+; GFX8_DPP-NEXT: .LBB11_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX8_DPP-NEXT: v_sub_u32_e32 v7, vcc, s5, v7
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_subb_u32_e32 v8, vcc, v0, v8, vcc
+; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: sub_i64_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s7, v4, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s6, v3, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB11_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s7
+; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s10, -1
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: s_mov_b32 s8, s2
+; GFX9_DPP-NEXT: s_mov_b32 s9, s3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s6
+; GFX9_DPP-NEXT: buffer_atomic_sub_x2 v[7:8], off, s[8:11], 0 glc
+; GFX9_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9_DPP-NEXT: buffer_wbinvl1_vol
+; GFX9_DPP-NEXT: .LBB11_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX9_DPP-NEXT: v_sub_co_u32_e32 v7, vcc, s5, v7
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_subb_co_u32_e32 v8, vcc, v0, v8, vcc
+; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: sub_i64_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s10, v3, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 16
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s11, v4, 47
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 63
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 32
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7]
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s11, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s10, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB11_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1064_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1064_DPP-NEXT: buffer_atomic_sub_x2 v[9:10], off, s[4:7], 0 glc
+; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl1_inv
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB11_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v9
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v10
+; GFX1064_DPP-NEXT: v_sub_co_u32 v9, vcc, s2, v11
+; GFX1064_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc, s3, v12, vcc
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: sub_i64_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s8, v4, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s7, v3, 15
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v2, s8, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB11_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1032_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1032_DPP-NEXT: buffer_atomic_sub_x2 v[9:10], off, s[4:7], 0 glc
+; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl1_inv
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB11_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v9
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v10
+; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s2, v11
+; GFX1032_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: sub_i64_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v9
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v7, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 15
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s8, v3, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s9, v4, 31
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 63
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s10, v3, 47
+; GFX1164_DPP-NEXT: v_readlane_b32 s11, v4, 47
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 63
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7]
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s10, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s11, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[8:9]
+; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1164_DPP-NEXT: s_mov_b64 s[8:9], exec
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB11_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s5
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s4
+; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1164_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1164_DPP-NEXT: buffer_atomic_sub_u64 v[8:9], off, s[4:7], 0 glc
+; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl1_inv
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB11_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v8
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, v2
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v9
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_sub_co_u32 v8, vcc, s2, v10
+; GFX1164_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s3, v11, vcc
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: sub_i64_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v9
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v5, v4
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo
+; GFX1132_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s7, v3, 15
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_readlane_b32 s8, v4, 15
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v2, s8, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6
+; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: s_mov_b32 s8, exec_lo
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB11_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
+; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1132_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1132_DPP-NEXT: buffer_atomic_sub_u64 v[8:9], off, s[4:7], 0 glc
+; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl1_inv
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB11_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v8
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v2
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v9
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s2, v10
+; GFX1132_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
+;
+; GFX1264_DPP-LABEL: sub_i64_varying:
+; GFX1264_DPP: ; %bb.0: ; %entry
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, v0
+; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1264_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v9
+; GFX1264_DPP-NEXT: s_not_b64 exec, exec
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1264_DPP-NEXT: s_not_b64 exec, exec
+; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT: v_readlane_b32 s5, v3, 31
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5
+; GFX1264_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v7, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1264_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_readlane_b32 s6, v3, 15
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_readlane_b32 s7, v4, 15
+; GFX1264_DPP-NEXT: v_readlane_b32 s8, v3, 31
+; GFX1264_DPP-NEXT: v_readlane_b32 s9, v4, 31
+; GFX1264_DPP-NEXT: v_writelane_b32 v1, s6, 16
+; GFX1264_DPP-NEXT: v_readlane_b32 s6, v3, 63
+; GFX1264_DPP-NEXT: v_writelane_b32 v2, s7, 16
+; GFX1264_DPP-NEXT: v_readlane_b32 s10, v3, 47
+; GFX1264_DPP-NEXT: v_readlane_b32 s11, v4, 47
+; GFX1264_DPP-NEXT: v_readlane_b32 s7, v4, 63
+; GFX1264_DPP-NEXT: v_writelane_b32 v1, s8, 32
+; GFX1264_DPP-NEXT: v_writelane_b32 v2, s9, 32
+; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX1264_DPP-NEXT: s_mov_b64 s[4:5], s[6:7]
+; GFX1264_DPP-NEXT: v_writelane_b32 v1, s10, 48
+; GFX1264_DPP-NEXT: v_writelane_b32 v2, s11, 48
+; GFX1264_DPP-NEXT: s_mov_b64 exec, s[8:9]
+; GFX1264_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec
+; GFX1264_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1264_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1264_DPP-NEXT: s_cbranch_execz .LBB11_2
+; GFX1264_DPP-NEXT: ; %bb.1:
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, s5
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, s4
+; GFX1264_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0
+; GFX1264_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1264_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1264_DPP-NEXT: buffer_atomic_sub_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN
+; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0
+; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV
+; GFX1264_DPP-NEXT: .LBB11_2:
+; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0
+; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v8
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v10, v1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v11, v2
+; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v9
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_sub_co_u32 v8, vcc, s2, v10
+; GFX1264_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1264_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s3, v11, vcc
+; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null
+; GFX1264_DPP-NEXT: s_nop 0
+; GFX1264_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1264_DPP-NEXT: s_endpgm
+;
+; GFX1232_DPP-LABEL: sub_i64_varying:
+; GFX1232_DPP: ; %bb.0: ; %entry
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v8, v0
+; GFX1232_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v9
+; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1232_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1232_DPP-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v6, v2
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v5, v4
+; GFX1232_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo
+; GFX1232_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: v_readlane_b32 s7, v3, 15
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1232_DPP-NEXT: v_readlane_b32 s8, v4, 15
+; GFX1232_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6
+; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1232_DPP-NEXT: v_writelane_b32 v1, s7, 16
+; GFX1232_DPP-NEXT: v_writelane_b32 v2, s8, 16
+; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6
+; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo
+; GFX1232_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1232_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1232_DPP-NEXT: s_cbranch_execz .LBB11_2
+; GFX1232_DPP-NEXT: ; %bb.1:
+; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
+; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0
+; GFX1232_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1232_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1232_DPP-NEXT: buffer_atomic_sub_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN
+; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0
+; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV
+; GFX1232_DPP-NEXT: .LBB11_2:
+; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0
+; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v8
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v10, v1
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v11, v2
+; GFX1232_DPP-NEXT: v_readfirstlane_b32 s3, v9
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s2, v10
+; GFX1232_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1232_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
+; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null
+; GFX1232_DPP-NEXT: s_nop 0
+; GFX1232_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1232_DPP-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%zext = zext i32 %lane to i64
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 453bd07647c73..dc7b79a903a3d 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -1,11 +1,18 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
declare i32 @llvm.amdgcn.workitem.id.x()
@@ -15,8 +22,6 @@ declare i32 @llvm.amdgcn.workitem.id.x()
; Show what the atomic optimization pass will do for local pointers.
define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
-;
-;
; GFX7LESS-LABEL: add_i32_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
@@ -227,8 +232,6 @@ entry:
}
define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) {
-;
-;
; GFX7LESS-LABEL: add_i32_uniform:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
@@ -456,272 +459,630 @@ entry:
}
define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
+; GFX7LESS_ITERATIVE-LABEL: add_i32_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX7LESS_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s4, s4, s8
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB2_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: .LBB2_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX7LESS_ITERATIVE-NEXT: v_add_i32_e32 v0, vcc, s4, v1
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
;
+; GFX8_ITERATIVE-LABEL: add_i32_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX8_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX8_ITERATIVE-NEXT: s_add_i32 s4, s4, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX8_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: .LBB2_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_add_u32_e32 v0, vcc, s4, v1
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
;
-; GFX7LESS-LABEL: add_i32_varying:
-; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX7LESS-NEXT: s_endpgm
+; GFX9_ITERATIVE-LABEL: add_i32_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX9_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX9_ITERATIVE-NEXT: s_add_i32 s4, s4, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX9_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: .LBB2_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_add_u32_e32 v0, s4, v1
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
;
-; GFX8-LABEL: add_i32_varying:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: s_mov_b32 s4, 0
-; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: .LBB2_1: ; %ComputeLoop
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX8-NEXT: s_mov_b32 m0, s5
-; GFX8-NEXT: v_readlane_b32 s8, v0, s5
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: v_writelane_b32 v1, s4, m0
-; GFX8-NEXT: s_add_i32 s4, s4, s8
-; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
-; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX8-NEXT: s_cbranch_execz .LBB2_4
-; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v2
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: .LBB2_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
+; GFX1064_ITERATIVE-LABEL: add_i32_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s4, s4, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB2_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
;
-; GFX9-LABEL: add_i32_varying:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: s_mov_b32 s4, 0
-; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: .LBB2_1: ; %ComputeLoop
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: s_mov_b32 m0, s5
-; GFX9-NEXT: v_readlane_b32 s8, v0, s5
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: v_writelane_b32 v1, s4, m0
-; GFX9-NEXT: s_add_i32 s4, s4, s8
-; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
-; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execz .LBB2_4
-; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v2
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: .LBB2_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_add_u32_e32 v0, s4, v1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX9-NEXT: s_endpgm
+; GFX1032_ITERATIVE-LABEL: add_i32_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1032_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s3, s3, s6
+; GFX1032_ITERATIVE-NEXT: s_add_i32 s2, s2, s5
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2
+; GFX1032_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB2_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
;
-; GFX1064-LABEL: add_i32_varying:
-; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: s_mov_b32 s4, 0
-; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
-; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064-NEXT: s_add_i32 s4, s4, s8
-; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1
-; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execz .LBB2_4
-; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: v_mov_b32_e32 v2, s4
-; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v2
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB2_4:
-; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX1064-NEXT: s_endpgm
+; GFX1164_ITERATIVE-LABEL: add_i32_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s4, s4, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB2_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
;
-; GFX1032-LABEL: add_i32_varying:
-; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: .LBB2_1: ; %ComputeLoop
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
-; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
-; GFX1032-NEXT: s_add_i32 s2, s2, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1
-; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3
-; GFX1032-NEXT: s_cbranch_execz .LBB2_4
-; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s2
-; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v2
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB2_4:
-; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX1032-NEXT: s_endpgm
+; GFX1132_ITERATIVE-LABEL: add_i32_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1132_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s3, s3, s6
+; GFX1132_ITERATIVE-NEXT: s_add_i32 s2, s2, s5
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2
+; GFX1132_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB2_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
;
-; GFX1164-LABEL: add_i32_varying:
-; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_mov_b32 s4, 0
-; GFX1164-NEXT: ; implicit-def: $vgpr1
-; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
-; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164-NEXT: s_add_i32 s4, s4, s8
-; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1
-; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execz .LBB2_4
-; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s4
-; GFX1164-NEXT: ds_add_rtn_u32 v0, v0, v2
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB2_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164-NEXT: s_nop 0
-; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1164-NEXT: s_endpgm
+; GFX7LESS_DPP-LABEL: add_i32_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: ds_add_rtn_u32 v0, v1, v0
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
;
-; GFX1132-LABEL: add_i32_varying:
-; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: ; implicit-def: $vgpr1
-; GFX1132-NEXT: .LBB2_1: ; %ComputeLoop
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
-; GFX1132-NEXT: s_add_i32 s2, s2, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1
-; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3
-; GFX1132-NEXT: s_cbranch_execz .LBB2_4
-; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-NEXT: v_mov_b32_e32 v2, s2
-; GFX1132-NEXT: ds_add_rtn_u32 v0, v0, v2
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB2_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132-NEXT: s_nop 0
-; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1132-NEXT: s_endpgm
+; GFX8_DPP-LABEL: add_i32_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB2_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX8_DPP-NEXT: s_mov_b32 m0, -1
+; GFX8_DPP-NEXT: ds_add_rtn_u32 v0, v3, v0
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: .LBB2_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_add_u32_e32 v0, vcc, s4, v0
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: add_i32_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB2_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX9_DPP-NEXT: ds_add_rtn_u32 v0, v3, v0
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: .LBB2_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: add_i32_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB2_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s7
+; GFX1064_DPP-NEXT: s_mov_b32 s3, s7
+; GFX1064_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB2_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v0, s3, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: add_i32_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v3, s3, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s3, s4
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s3
+; GFX1032_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB2_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v0, s3, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: add_i32_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB2_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s7
+; GFX1164_DPP-NEXT: s_mov_b32 s3, s7
+; GFX1164_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB2_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v0, s3, v0
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: add_i32_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_writelane_b32 v3, s3, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132_DPP-NEXT: s_mov_b32 s3, s4
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s3
+; GFX1132_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB2_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v0, s3, v0
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%old = atomicrmw add ptr addrspace(3) @local_var32, i32 %lane acq_rel
@@ -730,189 +1091,422 @@ entry:
}
define amdgpu_kernel void @add_i32_varying_nouse() {
-; GFX7LESS-LABEL: add_i32_varying_nouse:
-; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: ds_add_u32 v1, v0
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_endpgm
+; GFX7LESS_ITERATIVE-LABEL: add_i32_varying_nouse:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX7LESS_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB3_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_ITERATIVE-NEXT: ds_add_u32 v0, v1
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: .LBB3_4:
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
;
-; GFX8-LABEL: add_i32_varying_nouse:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[0:1], exec
-; GFX8-NEXT: s_mov_b32 s2, 0
-; GFX8-NEXT: .LBB3_1: ; %ComputeLoop
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX8-NEXT: s_add_i32 s2, s2, s6
-; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX8-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8-NEXT: s_cbranch_execz .LBB3_4
-; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-NEXT: v_mov_b32_e32 v1, s2
-; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_add_u32 v0, v1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: .LBB3_4:
-; GFX8-NEXT: s_endpgm
+; GFX8_ITERATIVE-LABEL: add_i32_varying_nouse:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX8_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX8_ITERATIVE-NEXT: ds_add_u32 v0, v1
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: .LBB3_4:
+; GFX8_ITERATIVE-NEXT: s_endpgm
;
-; GFX9-LABEL: add_i32_varying_nouse:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[0:1], exec
-; GFX9-NEXT: s_mov_b32 s2, 0
-; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX9-NEXT: s_add_i32 s2, s2, s6
-; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execz .LBB3_4
-; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: ds_add_u32 v0, v1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: .LBB3_4:
-; GFX9-NEXT: s_endpgm
+; GFX9_ITERATIVE-LABEL: add_i32_varying_nouse:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX9_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2
+; GFX9_ITERATIVE-NEXT: ds_add_u32 v0, v1
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: .LBB3_4:
+; GFX9_ITERATIVE-NEXT: s_endpgm
;
-; GFX1064-LABEL: add_i32_varying_nouse:
-; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[0:1], exec
-; GFX1064-NEXT: s_mov_b32 s2, 0
-; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX1064-NEXT: s_add_i32 s2, s2, s6
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execz .LBB3_4
-; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: ds_add_u32 v0, v1
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB3_4:
-; GFX1064-NEXT: s_endpgm
+; GFX1064_ITERATIVE-LABEL: add_i32_varying_nouse:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX1064_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064_ITERATIVE-NEXT: ds_add_u32 v0, v1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB3_4:
+; GFX1064_ITERATIVE-NEXT: s_endpgm
;
-; GFX1032-LABEL: add_i32_varying_nouse:
-; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s1, exec_lo
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s2, s1
-; GFX1032-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1032-NEXT: s_andn2_b32 s1, s1, s2
-; GFX1032-NEXT: s_add_i32 s0, s0, s3
-; GFX1032-NEXT: s_cmp_lg_u32 s1, 0
-; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1
-; GFX1032-NEXT: s_cbranch_execz .LBB3_4
-; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v1, s0
-; GFX1032-NEXT: ds_add_u32 v0, v1
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB3_4:
-; GFX1032-NEXT: s_endpgm
+; GFX1032_ITERATIVE-LABEL: add_i32_varying_nouse:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0
+; GFX1032_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032_ITERATIVE-NEXT: ds_add_u32 v0, v1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB3_4:
+; GFX1032_ITERATIVE-NEXT: s_endpgm
;
-; GFX1164-LABEL: add_i32_varying_nouse:
-; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-NEXT: s_mov_b32 s2, 0
-; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: s_add_i32 s2, s2, s6
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execz .LBB3_4
-; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: ds_add_u32 v0, v1
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB3_4:
-; GFX1164-NEXT: s_endpgm
+; GFX1164_ITERATIVE-LABEL: add_i32_varying_nouse:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX1164_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164_ITERATIVE-NEXT: ds_add_u32 v0, v1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB3_4:
+; GFX1164_ITERATIVE-NEXT: s_endpgm
;
-; GFX1132-LABEL: add_i32_varying_nouse:
-; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s1, exec_lo
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s2, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1132-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1132-NEXT: s_and_not1_b32 s1, s1, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: s_add_i32 s0, s0, s3
-; GFX1132-NEXT: s_cmp_lg_u32 s1, 0
-; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s1, exec_lo
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1
-; GFX1132-NEXT: s_cbranch_execz .LBB3_4
-; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX1132-NEXT: ds_add_u32 v0, v1
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB3_4:
-; GFX1132-NEXT: s_endpgm
+; GFX1132_ITERATIVE-LABEL: add_i32_varying_nouse:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0
+; GFX1132_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1132_ITERATIVE-NEXT: ds_add_u32 v0, v1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB3_4:
+; GFX1132_ITERATIVE-NEXT: s_endpgm
+;
+; GFX7LESS_DPP-LABEL: add_i32_varying_nouse:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_DPP-NEXT: ds_add_u32 v1, v0
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_endpgm
+;
+; GFX8_DPP-LABEL: add_i32_varying_nouse:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX8_DPP-NEXT: s_mov_b32 s0, s2
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB3_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s0
+; GFX8_DPP-NEXT: s_mov_b32 m0, -1
+; GFX8_DPP-NEXT: ds_add_u32 v2, v0
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: .LBB3_2:
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: add_i32_varying_nouse:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9_DPP-NEXT: s_mov_b32 s0, s2
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB3_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0
+; GFX9_DPP-NEXT: ds_add_u32 v2, v0
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: .LBB3_2:
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: add_i32_varying_nouse:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 0
+; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_DPP-NEXT: s_add_i32 s0, s2, s3
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB3_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s0
+; GFX1064_DPP-NEXT: ds_add_u32 v0, v3
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB3_2:
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: add_i32_varying_nouse:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB3_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: ds_add_u32 v0, v3
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB3_2:
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: add_i32_varying_nouse:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB3_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: ds_add_u32 v0, v3
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB3_2:
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: add_i32_varying_nouse:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1
+; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB3_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: ds_add_u32 v0, v3
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB3_2:
+; GFX1132_DPP-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%old = atomicrmw add ptr addrspace(3) @local_var32, i32 %lane acq_rel
@@ -920,8 +1514,6 @@ entry:
}
define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
-;
-;
; GFX7LESS-LABEL: add_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
@@ -1150,8 +1742,6 @@ entry:
}
define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) {
-;
-;
; GFX7LESS-LABEL: add_i64_uniform:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec
@@ -1428,359 +2018,1649 @@ entry:
}
define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
+; GFX7LESS_ITERATIVE-LABEL: add_i64_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX7LESS_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s2, s2, s8
+; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s3, s3, s7
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB6_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: .LBB6_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX7LESS_ITERATIVE-NEXT: v_add_i32_e32 v0, vcc, s5, v1
+; GFX7LESS_ITERATIVE-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
+;
+; GFX8_ITERATIVE-LABEL: add_i64_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX8_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_add_u32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX8_ITERATIVE-NEXT: s_addc_u32 s3, s3, s7
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX8_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: .LBB6_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX8_ITERATIVE-NEXT: v_add_u32_e32 v0, vcc, s5, v1
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
+;
+; GFX9_ITERATIVE-LABEL: add_i64_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX9_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_add_u32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX9_ITERATIVE-NEXT: s_addc_u32 s3, s3, s7
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX9_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: .LBB6_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX9_ITERATIVE-NEXT: v_add_co_u32_e32 v0, vcc, s5, v1
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v2, vcc
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1064_ITERATIVE-LABEL: add_i64_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1064_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s6
+; GFX1064_ITERATIVE-NEXT: s_add_u32 s2, s2, s7
+; GFX1064_ITERATIVE-NEXT: s_addc_u32 s3, s3, s8
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1064_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB6_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1064_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v1
+; GFX1064_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
;
+; GFX1032_ITERATIVE-LABEL: add_i64_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1032_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1032_ITERATIVE-NEXT: s_add_u32 s2, s2, s6
+; GFX1032_ITERATIVE-NEXT: s_addc_u32 s3, s3, s7
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1032_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB6_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1032_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
+; GFX1032_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
;
-; GFX7LESS-LABEL: add_i64_varying:
+; GFX1164_ITERATIVE-LABEL: add_i64_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1164_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s6
+; GFX1164_ITERATIVE-NEXT: s_add_u32 s2, s2, s7
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_addc_u32 s3, s3, s8
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1164_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB6_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v1
+; GFX1164_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1132_ITERATIVE-LABEL: add_i64_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1132_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1132_ITERATIVE-NEXT: s_add_u32 s2, s2, s6
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_addc_u32 s3, s3, s7
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2
+; GFX1132_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB6_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
+; GFX1132_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
+;
+; GFX7LESS_DPP-LABEL: add_i64_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
+;
+; GFX8_DPP-LABEL: add_i64_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB6_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT: s_mov_b32 m0, -1
+; GFX8_DPP-NEXT: ds_add_rtn_u64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: .LBB6_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v10
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v9
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX8_DPP-NEXT: v_add_u32_e32 v7, vcc, s5, v7
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v8, vcc, v0, v8, vcc
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: add_i64_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB6_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX9_DPP-NEXT: ds_add_rtn_u64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: .LBB6_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v10
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v9
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v7, vcc, s5, v7
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v8, vcc, v0, v8, vcc
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: add_i64_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s9, v4, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB6_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s5
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s4
+; GFX1064_DPP-NEXT: ds_add_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB6_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v11
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v12
+; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s3, v9
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s4, v10, vcc
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: add_i64_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT: v_readlane_b32 s3, v3, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v1, s3, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB6_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s5
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s4
+; GFX1032_DPP-NEXT: ds_add_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB6_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v11
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v12
+; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s3, v9
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s4, v10, vcc_lo
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: add_i64_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v9
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v7, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 15
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s4, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 31
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s5, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s9, v4, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 32
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11
+; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB6_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s5
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s4
+; GFX1164_DPP-NEXT: ds_add_rtn_u64 v[10:11], v9, v[10:11]
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB6_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v10
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v2
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_u32 v8, vcc, s3, v8
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s4, v9, vcc
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: add_i64_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v9
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v5, v4
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s3, v3, 15
+; GFX1132_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s6, v4, 15
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_writelane_b32 v1, s3, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11
+; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB6_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GFX1132_DPP-NEXT: ds_add_rtn_u64 v[10:11], v9, v[10:11]
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB6_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v10
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v2
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s3, v8
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s4, v9, vcc_lo
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %zext = zext i32 %lane to i64
+ %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %zext acq_rel
+ store i64 %old, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @add_i64_varying_nouse() {
+; GFX7LESS_ITERATIVE-LABEL: add_i64_varying_nouse:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_nop 0
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s4
+; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
+; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB7_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1]
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: .LBB7_4:
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
+;
+; GFX8_ITERATIVE-LABEL: add_i64_varying_nouse:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0
+; GFX8_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s4
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4
+; GFX8_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
+; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX8_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1]
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: .LBB7_4:
+; GFX8_ITERATIVE-NEXT: s_endpgm
+;
+; GFX9_ITERATIVE-LABEL: add_i64_varying_nouse:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0
+; GFX9_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s4
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4
+; GFX9_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
+; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1
+; GFX9_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1]
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: .LBB7_4:
+; GFX9_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1064_ITERATIVE-LABEL: add_i64_varying_nouse:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s4
+; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s5
+; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1
+; GFX1064_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1]
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB7_4:
+; GFX1064_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1032_ITERATIVE-LABEL: add_i64_varying_nouse:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0
+; GFX1032_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s3
+; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s4
+; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1]
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB7_4:
+; GFX1032_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1164_ITERATIVE-LABEL: add_i64_varying_nouse:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s4, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s4
+; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s5
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1
+; GFX1164_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1]
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB7_4:
+; GFX1164_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1132_ITERATIVE-LABEL: add_i64_varying_nouse:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0
+; GFX1132_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s3
+; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX1132_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1]
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB7_4:
+; GFX1132_ITERATIVE-NEXT: s_endpgm
+;
+; GFX7LESS_DPP-LABEL: add_i64_varying_nouse:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_DPP-NEXT: ds_add_u64 v1, v[0:1]
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_endpgm
+;
+; GFX8_DPP-LABEL: add_i64_varying_nouse:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s3, v2, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX8_DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB7_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX8_DPP-NEXT: s_mov_b32 m0, -1
+; GFX8_DPP-NEXT: ds_add_u64 v8, v[9:10]
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: .LBB7_2:
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: add_i64_varying_nouse:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v1, vcc, v3, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v2, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s3, v2, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9_DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB7_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX9_DPP-NEXT: ds_add_u64 v8, v[9:10]
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: .LBB7_2:
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: add_i64_varying_nouse:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v3, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v4, v2, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v4, vcc
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 0
+; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 0
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 32
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: s_add_u32 s0, s3, s4
+; GFX1064_DPP-NEXT: s_addc_u32 s1, s2, s5
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB7_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0
+; GFX1064_DPP-NEXT: ds_add_u64 v10, v[11:12]
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB7_2:
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: add_i64_varying_nouse:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v3, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB7_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: ds_add_u64 v10, v[11:12]
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB7_2:
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: add_i64_varying_nouse:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v6
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v7
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v4, v1, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v5, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v3, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
+; GFX1164_DPP-NEXT: v_permlane64_b32 v3, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_permlane64_b32 v4, v1
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v1, v4, vcc
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v3
+; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB7_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: ds_add_u64 v7, v[8:9]
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB7_2:
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: add_i64_varying_nouse:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v6
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v7
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132_DPP-NEXT: v_add_co_u32 v2, vcc_lo, v3, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v4, vcc_lo
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v3
+; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB7_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: ds_add_u64 v7, v[8:9]
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB7_2:
+; GFX1132_DPP-NEXT: s_endpgm
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %zext = zext i32 %lane to i64
+ %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %zext acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
+; GFX7LESS-LABEL: sub_i32_constant:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: ; implicit-def: $vgpr1
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: s_mov_b32 m0, -1
+; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: .LBB8_2:
+; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
-; GFX8-LABEL: add_i64_varying:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[4:5], exec
-; GFX8-NEXT: v_mov_b32_e32 v3, 0
-; GFX8-NEXT: s_mov_b64 s[2:3], 0
-; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX8-NEXT: .LBB6_1: ; %ComputeLoop
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s6, s[4:5]
-; GFX8-NEXT: s_mov_b32 m0, s6
-; GFX8-NEXT: v_readlane_b32 s8, v0, s6
-; GFX8-NEXT: v_readlane_b32 s7, v3, s6
-; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_u32 s2, s2, s8
-; GFX8-NEXT: v_writelane_b32 v2, s3, m0
-; GFX8-NEXT: s_addc_u32 s3, s3, s7
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX8-NEXT: s_cbranch_scc1 .LBB6_1
-; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execz .LBB6_4
-; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: .LBB6_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v4
-; GFX8-NEXT: v_readfirstlane_b32 s5, v3
-; GFX8-NEXT: v_mov_b32_e32 v3, s4
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s5, v1
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-LABEL: add_i64_varying:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[4:5], exec
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX9-NEXT: .LBB6_1: ; %ComputeLoop
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s6, s[4:5]
-; GFX9-NEXT: s_mov_b32 m0, s6
-; GFX9-NEXT: v_readlane_b32 s8, v0, s6
-; GFX9-NEXT: v_readlane_b32 s7, v3, s6
-; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_u32 s2, s2, s8
-; GFX9-NEXT: v_writelane_b32 v2, s3, m0
-; GFX9-NEXT: s_addc_u32 s3, s3, s7
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX9-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
-; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execz .LBB6_4
-; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: .LBB6_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v4
-; GFX9-NEXT: v_readfirstlane_b32 s5, v3
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v1
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v2, vcc
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX9-NEXT: s_endpgm
-;
-; GFX1064-LABEL: add_i64_varying:
-; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: s_mov_b64 s[4:5], exec
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s6, s[4:5]
-; GFX1064-NEXT: v_readlane_b32 s7, v0, s6
-; GFX1064-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1064-NEXT: v_writelane_b32 v1, s2, s6
-; GFX1064-NEXT: v_writelane_b32 v2, s3, s6
-; GFX1064-NEXT: s_add_u32 s2, s2, s7
-; GFX1064-NEXT: s_addc_u32 s3, s3, s8
-; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1064-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
-; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1
-; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064-NEXT: s_cbranch_execz .LBB6_4
-; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v4, s3
-; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: v_mov_b32_e32 v3, s2
-; GFX1064-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB6_4:
-; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v4
-; GFX1064-NEXT: v_add_co_u32 v0, vcc, s2, v1
-; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX1064-NEXT: s_endpgm
-;
-; GFX1032-LABEL: add_i64_varying:
-; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: s_mov_b32 s4, exec_lo
-; GFX1032-NEXT: s_mov_b64 s[2:3], 0
-; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX1032-NEXT: .LBB6_1: ; %ComputeLoop
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s5, s4
-; GFX1032-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1032-NEXT: v_readlane_b32 s7, v3, s5
-; GFX1032-NEXT: v_writelane_b32 v1, s2, s5
-; GFX1032-NEXT: v_writelane_b32 v2, s3, s5
-; GFX1032-NEXT: s_add_u32 s2, s2, s6
-; GFX1032-NEXT: s_addc_u32 s3, s3, s7
-; GFX1032-NEXT: s_lshl_b32 s5, 1, s5
-; GFX1032-NEXT: s_andn2_b32 s4, s4, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s4, 0
-; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1
-; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s4, exec_lo, s4
-; GFX1032-NEXT: s_cbranch_execz .LBB6_4
-; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v4, s3
-; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v3, s2
-; GFX1032-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB6_4:
-; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v4
-; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
-; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX1032-NEXT: s_endpgm
-;
-; GFX1164-LABEL: add_i64_varying:
-; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX1164-NEXT: .LBB6_1: ; %ComputeLoop
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s6, s[4:5]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1164-NEXT: v_readlane_b32 s7, v0, s6
-; GFX1164-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1164-NEXT: v_writelane_b32 v1, s2, s6
-; GFX1164-NEXT: v_writelane_b32 v2, s3, s6
-; GFX1164-NEXT: s_add_u32 s2, s2, s7
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_addc_u32 s3, s3, s8
-; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1164-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
-; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1164-NEXT: s_cbranch_execz .LBB6_4
-; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v4, s3
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-NEXT: v_mov_b32_e32 v3, s2
-; GFX1164-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB6_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v4
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_add_co_u32 v0, vcc, s2, v1
-; GFX1164-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164-NEXT: s_nop 0
-; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1164-NEXT: s_endpgm
-;
-; GFX1132-LABEL: add_i64_varying:
-; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: s_mov_b64 s[2:3], 0
-; GFX1132-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX1132-NEXT: .LBB6_1: ; %ComputeLoop
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s5, s4
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1132-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1132-NEXT: v_readlane_b32 s7, v3, s5
-; GFX1132-NEXT: v_writelane_b32 v1, s2, s5
-; GFX1132-NEXT: v_writelane_b32 v2, s3, s5
-; GFX1132-NEXT: s_add_u32 s2, s2, s6
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: s_addc_u32 s3, s3, s7
-; GFX1132-NEXT: s_lshl_b32 s5, 1, s5
-; GFX1132-NEXT: s_and_not1_b32 s4, s4, s5
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_cmp_lg_u32 s4, 0
-; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
-; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_xor_b32 s4, exec_lo, s4
-; GFX1132-NEXT: s_cbranch_execz .LBB6_4
-; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v4, s3
-; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2
-; GFX1132-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB6_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v4
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
-; GFX1132-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132-NEXT: s_nop 0
-; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1132-NEXT: s_endpgm
-entry:
- %lane = call i32 @llvm.amdgcn.workitem.id.x()
- %zext = zext i32 %lane to i64
- %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %zext acq_rel
- store i64 %old, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
-;
-;
-; GFX7LESS-LABEL: sub_i32_constant:
-; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
-; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
-; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT: ; implicit-def: $vgpr1
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2
-; GFX7LESS-NEXT: ; %bb.1:
-; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
-; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: .LBB7_2:
-; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
-; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX7LESS-NEXT: s_endpgm
-;
-; GFX8-LABEL: sub_i32_constant:
+; GFX8-LABEL: sub_i32_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_mov_b64 s[4:5], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
@@ -1788,7 +3668,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_cbranch_execz .LBB7_2
+; GFX8-NEXT: s_cbranch_execz .LBB8_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX8-NEXT: s_mul_i32 s4, s4, 5
@@ -1797,7 +3677,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: .LBB7_2:
+; GFX8-NEXT: .LBB8_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
@@ -1817,7 +3697,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB7_2
+; GFX9-NEXT: s_cbranch_execz .LBB8_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX9-NEXT: s_mul_i32 s4, s4, 5
@@ -1825,7 +3705,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: .LBB7_2:
+; GFX9-NEXT: .LBB8_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
@@ -1845,7 +3725,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB7_2
+; GFX1064-NEXT: s_cbranch_execz .LBB8_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
@@ -1854,7 +3734,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB7_2:
+; GFX1064-NEXT: .LBB8_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1874,7 +3754,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB7_2
+; GFX1032-NEXT: s_cbranch_execz .LBB8_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
@@ -1883,7 +3763,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB7_2:
+; GFX1032-NEXT: .LBB8_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1905,7 +3785,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_cbranch_execz .LBB7_2
+; GFX1164-NEXT: s_cbranch_execz .LBB8_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
@@ -1915,7 +3795,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB7_2:
+; GFX1164-NEXT: .LBB8_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
@@ -1938,7 +3818,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_cbranch_execz .LBB7_2
+; GFX1132-NEXT: s_cbranch_execz .LBB8_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -1947,7 +3827,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB7_2:
+; GFX1132-NEXT: .LBB8_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
@@ -1968,8 +3848,6 @@ entry:
}
define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) {
-;
-;
; GFX7LESS-LABEL: sub_i32_uniform:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
@@ -1979,7 +3857,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2
+; GFX7LESS-NEXT: s_cbranch_execz .LBB9_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -1989,7 +3867,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: .LBB8_2:
+; GFX7LESS-NEXT: .LBB9_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
@@ -2010,7 +3888,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_cbranch_execz .LBB8_2
+; GFX8-NEXT: s_cbranch_execz .LBB9_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -2020,7 +3898,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: .LBB8_2:
+; GFX8-NEXT: .LBB9_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -2041,7 +3919,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB8_2
+; GFX9-NEXT: s_cbranch_execz .LBB9_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -2050,7 +3928,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: .LBB8_2:
+; GFX9-NEXT: .LBB9_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -2071,7 +3949,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB8_2
+; GFX1064-NEXT: s_cbranch_execz .LBB9_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
@@ -2081,7 +3959,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB8_2:
+; GFX1064-NEXT: .LBB9_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2102,7 +3980,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB8_2
+; GFX1032-NEXT: s_cbranch_execz .LBB9_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
@@ -2112,7 +3990,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB8_2:
+; GFX1032-NEXT: .LBB9_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2135,7 +4013,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_cbranch_execz .LBB8_2
+; GFX1164-NEXT: s_cbranch_execz .LBB9_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
@@ -2146,7 +4024,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB8_2:
+; GFX1164-NEXT: .LBB9_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -2170,7 +4048,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_cbranch_execz .LBB8_2
+; GFX1132-NEXT: s_cbranch_execz .LBB9_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -2180,7 +4058,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB8_2:
+; GFX1132-NEXT: .LBB9_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -2201,514 +4079,1103 @@ entry:
}
define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
+; GFX7LESS_ITERATIVE-LABEL: sub_i32_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX7LESS_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s4, s4, s8
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB10_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: .LBB10_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX7LESS_ITERATIVE-NEXT: v_sub_i32_e32 v0, vcc, s4, v1
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
+;
+; GFX8_ITERATIVE-LABEL: sub_i32_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX8_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX8_ITERATIVE-NEXT: s_add_i32 s4, s4, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX8_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: .LBB10_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_sub_u32_e32 v0, vcc, s4, v1
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
+;
+; GFX9_ITERATIVE-LABEL: sub_i32_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX9_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX9_ITERATIVE-NEXT: s_add_i32 s4, s4, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX9_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: .LBB10_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_sub_u32_e32 v0, s4, v1
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1064_ITERATIVE-LABEL: sub_i32_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1064_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s4, s4, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB10_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1032_ITERATIVE-LABEL: sub_i32_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1032_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s3, s3, s6
+; GFX1032_ITERATIVE-NEXT: s_add_i32 s2, s2, s5
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2
+; GFX1032_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB10_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1164_ITERATIVE-LABEL: sub_i32_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1164_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s4, s4, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB10_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1132_ITERATIVE-LABEL: sub_i32_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1132_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s3, s3, s6
+; GFX1132_ITERATIVE-NEXT: s_add_i32 s2, s2, s5
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2
+; GFX1132_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB10_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
+;
+; GFX7LESS_DPP-LABEL: sub_i32_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: ds_sub_rtn_u32 v0, v1, v0
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
+;
+; GFX8_DPP-LABEL: sub_i32_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB10_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX8_DPP-NEXT: s_mov_b32 m0, -1
+; GFX8_DPP-NEXT: ds_sub_rtn_u32 v0, v3, v0
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: .LBB10_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: sub_i32_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB10_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX9_DPP-NEXT: ds_sub_rtn_u32 v0, v3, v0
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: .LBB10_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_sub_u32_e32 v0, s4, v0
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: sub_i32_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB10_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s7
+; GFX1064_DPP-NEXT: s_mov_b32 s3, s7
+; GFX1064_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB10_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064_DPP-NEXT: v_sub_nc_u32_e32 v0, s3, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: sub_i32_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v3, s3, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s3, s4
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB10_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s3
+; GFX1032_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB10_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032_DPP-NEXT: v_sub_nc_u32_e32 v0, s3, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: sub_i32_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB10_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s7
+; GFX1164_DPP-NEXT: s_mov_b32 s3, s7
+; GFX1164_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB10_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_sub_nc_u32_e32 v0, s3, v0
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: sub_i32_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_writelane_b32 v3, s3, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132_DPP-NEXT: s_mov_b32 s3, s4
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB10_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s3
+; GFX1132_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB10_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_sub_nc_u32_e32 v0, s3, v0
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %lane acq_rel
+ store i32 %old, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @sub_i32_varying_nouse() {
+; GFX7LESS_ITERATIVE-LABEL: sub_i32_varying_nouse:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX7LESS_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB11_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_ITERATIVE-NEXT: ds_sub_u32 v0, v1
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: .LBB11_4:
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
+;
+; GFX8_ITERATIVE-LABEL: sub_i32_varying_nouse:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX8_ITERATIVE-NEXT: ds_sub_u32 v0, v1
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: .LBB11_4:
+; GFX8_ITERATIVE-NEXT: s_endpgm
+;
+; GFX9_ITERATIVE-LABEL: sub_i32_varying_nouse:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2
+; GFX9_ITERATIVE-NEXT: ds_sub_u32 v0, v1
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: .LBB11_4:
+; GFX9_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1064_ITERATIVE-LABEL: sub_i32_varying_nouse:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX1064_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064_ITERATIVE-NEXT: ds_sub_u32 v0, v1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB11_4:
+; GFX1064_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1032_ITERATIVE-LABEL: sub_i32_varying_nouse:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0
+; GFX1032_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032_ITERATIVE-NEXT: ds_sub_u32 v0, v1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB11_4:
+; GFX1032_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1164_ITERATIVE-LABEL: sub_i32_varying_nouse:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX1164_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164_ITERATIVE-NEXT: ds_sub_u32 v0, v1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB11_4:
+; GFX1164_ITERATIVE-NEXT: s_endpgm
;
+; GFX1132_ITERATIVE-LABEL: sub_i32_varying_nouse:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0
+; GFX1132_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1132_ITERATIVE-NEXT: ds_sub_u32 v0, v1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB11_4:
+; GFX1132_ITERATIVE-NEXT: s_endpgm
;
-; GFX7LESS-LABEL: sub_i32_varying:
+; GFX7LESS_DPP-LABEL: sub_i32_varying_nouse:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_DPP-NEXT: ds_sub_u32 v1, v0
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_endpgm
+;
+; GFX8_DPP-LABEL: sub_i32_varying_nouse:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX8_DPP-NEXT: s_mov_b32 s0, s2
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB11_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s0
+; GFX8_DPP-NEXT: s_mov_b32 m0, -1
+; GFX8_DPP-NEXT: ds_sub_u32 v2, v0
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: .LBB11_2:
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: sub_i32_varying_nouse:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9_DPP-NEXT: s_mov_b32 s0, s2
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB11_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0
+; GFX9_DPP-NEXT: ds_sub_u32 v2, v0
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: .LBB11_2:
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: sub_i32_varying_nouse:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 0
+; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_DPP-NEXT: s_add_i32 s0, s2, s3
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB11_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s0
+; GFX1064_DPP-NEXT: ds_sub_u32 v0, v3
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB11_2:
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: sub_i32_varying_nouse:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB11_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: ds_sub_u32 v0, v3
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB11_2:
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: sub_i32_varying_nouse:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB11_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: ds_sub_u32 v0, v3
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB11_2:
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: sub_i32_varying_nouse:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1
+; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB11_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: ds_sub_u32 v0, v3
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB11_2:
+; GFX1132_DPP-NEXT: s_endpgm
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %lane acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
+; GFX7LESS-LABEL: sub_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4
; GFX7LESS-NEXT: s_mov_b32 m0, -1
+; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: .LBB12_2:
+; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0
+; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
+; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
+; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s5, v0
+; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
-; GFX8-LABEL: sub_i32_varying:
+; GFX8-LABEL: sub_i64_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: s_mov_b32 s4, 0
-; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: .LBB9_1: ; %ComputeLoop
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX8-NEXT: s_mov_b32 m0, s5
-; GFX8-NEXT: v_readlane_b32 s8, v0, s5
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: v_writelane_b32 v1, s4, m0
-; GFX8-NEXT: s_add_i32 s4, s4, s8
-; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8-NEXT: s_cbranch_scc1 .LBB9_1
-; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT: ; implicit-def: $vgpr0
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX8-NEXT: s_cbranch_execz .LBB9_4
-; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v2
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: .LBB9_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-LABEL: sub_i32_varying:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: s_mov_b32 s4, 0
-; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: .LBB9_1: ; %ComputeLoop
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: s_mov_b32 m0, s5
-; GFX9-NEXT: v_readlane_b32 s8, v0, s5
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: v_writelane_b32 v1, s4, m0
-; GFX9-NEXT: s_add_i32 s4, s4, s8
-; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9-NEXT: s_cbranch_scc1 .LBB9_1
-; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execz .LBB9_4
-; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v2
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: .LBB9_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX9-NEXT: s_endpgm
-;
-; GFX1064-LABEL: sub_i32_varying:
-; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: s_mov_b32 s4, 0
-; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
-; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064-NEXT: s_add_i32 s4, s4, s8
-; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1
-; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execz .LBB9_4
-; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: v_mov_b32_e32 v2, s4
-; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v2
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB9_4:
-; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX1064-NEXT: s_endpgm
-;
-; GFX1032-LABEL: sub_i32_varying:
-; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
-; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
-; GFX1032-NEXT: s_add_i32 s2, s2, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1
-; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3
-; GFX1032-NEXT: s_cbranch_execz .LBB9_4
-; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s2
-; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v2
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB9_4:
-; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX1032-NEXT: s_endpgm
-;
-; GFX1164-LABEL: sub_i32_varying:
-; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_mov_b32 s4, 0
-; GFX1164-NEXT: ; implicit-def: $vgpr1
-; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
-; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164-NEXT: s_add_i32 s4, s4, s8
-; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
-; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execz .LBB9_4
-; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s4
-; GFX1164-NEXT: ds_sub_rtn_u32 v0, v0, v2
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB9_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164-NEXT: s_nop 0
-; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1164-NEXT: s_endpgm
-;
-; GFX1132-LABEL: sub_i32_varying:
-; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: ; implicit-def: $vgpr1
-; GFX1132-NEXT: .LBB9_1: ; %ComputeLoop
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
-; GFX1132-NEXT: s_add_i32 s2, s2, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
-; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3
-; GFX1132-NEXT: s_cbranch_execz .LBB9_4
-; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-NEXT: v_mov_b32_e32 v2, s2
-; GFX1132-NEXT: ds_sub_rtn_u32 v0, v0, v2
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB9_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132-NEXT: s_nop 0
-; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1132-NEXT: s_endpgm
-entry:
- %lane = call i32 @llvm.amdgcn.workitem.id.x()
- %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %lane acq_rel
- store i32 %old, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @sub_i32_varying_nouse() {
-; GFX7LESS-LABEL: sub_i32_varying_nouse:
-; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: ds_sub_u32 v1, v0
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_endpgm
-;
-; GFX8-LABEL: sub_i32_varying_nouse:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[0:1], exec
-; GFX8-NEXT: s_mov_b32 s2, 0
-; GFX8-NEXT: .LBB10_1: ; %ComputeLoop
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX8-NEXT: s_add_i32 s2, s2, s6
-; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX8-NEXT: s_cbranch_scc1 .LBB10_1
-; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8-NEXT: s_cbranch_execz .LBB10_4
-; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-NEXT: v_mov_b32_e32 v1, s2
-; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_sub_u32 v0, v1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: .LBB10_4:
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-LABEL: sub_i32_varying_nouse:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[0:1], exec
-; GFX9-NEXT: s_mov_b32 s2, 0
-; GFX9-NEXT: .LBB10_1: ; %ComputeLoop
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX9-NEXT: s_add_i32 s2, s2, s6
-; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_cbranch_scc1 .LBB10_1
-; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execz .LBB10_4
-; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: ds_sub_u32 v0, v1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: .LBB10_4:
-; GFX9-NEXT: s_endpgm
-;
-; GFX1064-LABEL: sub_i32_varying_nouse:
-; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[0:1], exec
-; GFX1064-NEXT: s_mov_b32 s2, 0
-; GFX1064-NEXT: .LBB10_1: ; %ComputeLoop
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX1064-NEXT: s_add_i32 s2, s2, s6
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1
-; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execz .LBB10_4
-; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: ds_sub_u32 v0, v1
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB10_4:
-; GFX1064-NEXT: s_endpgm
-;
-; GFX1032-LABEL: sub_i32_varying_nouse:
-; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s1, exec_lo
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB10_1: ; %ComputeLoop
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s2, s1
-; GFX1032-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1032-NEXT: s_andn2_b32 s1, s1, s2
-; GFX1032-NEXT: s_add_i32 s0, s0, s3
-; GFX1032-NEXT: s_cmp_lg_u32 s1, 0
-; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1
-; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1
-; GFX1032-NEXT: s_cbranch_execz .LBB10_4
-; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v1, s0
-; GFX1032-NEXT: ds_sub_u32 v0, v1
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB10_4:
-; GFX1032-NEXT: s_endpgm
-;
-; GFX1164-LABEL: sub_i32_varying_nouse:
-; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-NEXT: s_mov_b32 s2, 0
-; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: s_add_i32 s2, s2, s6
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
-; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execz .LBB10_4
-; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: ds_sub_u32 v0, v1
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB10_4:
-; GFX1164-NEXT: s_endpgm
-;
-; GFX1132-LABEL: sub_i32_varying_nouse:
-; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s1, exec_lo
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s2, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1132-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1132-NEXT: s_and_not1_b32 s1, s1, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: s_add_i32 s0, s0, s3
-; GFX1132-NEXT: s_cmp_lg_u32 s1, 0
-; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1
-; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s1, exec_lo
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1
-; GFX1132-NEXT: s_cbranch_execz .LBB10_4
-; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX1132-NEXT: ds_sub_u32 v0, v1
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB10_4:
-; GFX1132-NEXT: s_endpgm
-entry:
- %lane = call i32 @llvm.amdgcn.workitem.id.x()
- %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %lane acq_rel
- ret void
-}
-
-define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
-;
-;
-; GFX7LESS-LABEL: sub_i64_constant:
-; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
-; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
-; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX7LESS-NEXT: s_cbranch_execz .LBB11_2
-; GFX7LESS-NEXT: ; %bb.1:
-; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4
-; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: .LBB11_2:
-; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0
-; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
-; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s5, v0
-; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX7LESS-NEXT: s_endpgm
-;
-; GFX8-LABEL: sub_i64_constant:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[4:5], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_cbranch_execz .LBB11_2
+; GFX8-NEXT: s_cbranch_execz .LBB12_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX8-NEXT: s_mul_i32 s4, s4, 5
@@ -2717,7 +5184,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: .LBB11_2:
+; GFX8-NEXT: .LBB12_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
@@ -2741,7 +5208,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB11_2
+; GFX9-NEXT: s_cbranch_execz .LBB12_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX9-NEXT: s_mul_i32 s4, s4, 5
@@ -2749,7 +5216,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: .LBB11_2:
+; GFX9-NEXT: .LBB12_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
@@ -2773,7 +5240,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB11_2
+; GFX1064-NEXT: s_cbranch_execz .LBB12_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
@@ -2782,7 +5249,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB11_2:
+; GFX1064-NEXT: .LBB12_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2805,7 +5272,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB11_2
+; GFX1032-NEXT: s_cbranch_execz .LBB12_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
@@ -2814,7 +5281,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB11_2:
+; GFX1032-NEXT: .LBB12_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2839,7 +5306,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
-; GFX1164-NEXT: s_cbranch_execz .LBB11_2
+; GFX1164-NEXT: s_cbranch_execz .LBB12_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
@@ -2849,7 +5316,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB11_2:
+; GFX1164-NEXT: .LBB12_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
@@ -2875,7 +5342,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
-; GFX1132-NEXT: s_cbranch_execz .LBB11_2
+; GFX1132-NEXT: s_cbranch_execz .LBB12_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
@@ -2885,7 +5352,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB11_2:
+; GFX1132-NEXT: .LBB12_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
@@ -2909,8 +5376,6 @@ entry:
}
define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) {
-;
-;
; GFX7LESS-LABEL: sub_i64_uniform:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec
@@ -2920,7 +5385,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2
+; GFX7LESS-NEXT: s_cbranch_execz .LBB13_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0
@@ -2934,7 +5399,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: .LBB12_2:
+; GFX7LESS-NEXT: .LBB13_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s6, -1
@@ -2962,7 +5427,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX8-NEXT: s_cbranch_execz .LBB12_2
+; GFX8-NEXT: s_cbranch_execz .LBB13_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v0, s8
@@ -2974,7 +5439,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: .LBB12_2:
+; GFX8-NEXT: .LBB13_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s4, s0
@@ -3001,7 +5466,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB12_2
+; GFX9-NEXT: s_cbranch_execz .LBB13_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -3014,7 +5479,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: .LBB12_2:
+; GFX9-NEXT: .LBB13_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
@@ -3041,7 +5506,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB12_2
+; GFX1064-NEXT: s_cbranch_execz .LBB13_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
@@ -3055,7 +5520,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB12_2:
+; GFX1064-NEXT: .LBB13_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
@@ -3079,7 +5544,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB12_2
+; GFX1032-NEXT: s_cbranch_execz .LBB13_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
@@ -3093,7 +5558,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB12_2:
+; GFX1032-NEXT: .LBB13_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
@@ -3119,7 +5584,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
-; GFX1164-NEXT: s_cbranch_execz .LBB12_2
+; GFX1164-NEXT: s_cbranch_execz .LBB13_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
@@ -3133,7 +5598,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB12_2:
+; GFX1164-NEXT: .LBB13_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0
@@ -3161,7 +5626,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
-; GFX1132-NEXT: s_cbranch_execz .LBB12_2
+; GFX1132-NEXT: s_cbranch_execz .LBB13_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
@@ -3175,7 +5640,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB12_2:
+; GFX1132-NEXT: .LBB13_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0
@@ -3200,318 +5665,951 @@ entry:
}
define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
+; GFX7LESS_ITERATIVE-LABEL: sub_i64_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX7LESS_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s2, s2, s8
+; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s3, s3, s7
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB14_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: .LBB14_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX7LESS_ITERATIVE-NEXT: v_sub_i32_e32 v0, vcc, s5, v1
+; GFX7LESS_ITERATIVE-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
;
+; GFX8_ITERATIVE-LABEL: sub_i64_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX8_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_add_u32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX8_ITERATIVE-NEXT: s_addc_u32 s3, s3, s7
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX8_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: .LBB14_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX8_ITERATIVE-NEXT: v_sub_u32_e32 v0, vcc, s5, v1
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
;
-; GFX7LESS-LABEL: sub_i64_varying:
-; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX7LESS-NEXT: s_endpgm
+; GFX9_ITERATIVE-LABEL: sub_i64_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX9_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_add_u32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX9_ITERATIVE-NEXT: s_addc_u32 s3, s3, s7
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX9_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: .LBB14_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX9_ITERATIVE-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v1
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v2, vcc
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
;
-; GFX8-LABEL: sub_i64_varying:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[4:5], exec
-; GFX8-NEXT: v_mov_b32_e32 v3, 0
-; GFX8-NEXT: s_mov_b64 s[2:3], 0
-; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX8-NEXT: .LBB13_1: ; %ComputeLoop
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s6, s[4:5]
-; GFX8-NEXT: s_mov_b32 m0, s6
-; GFX8-NEXT: v_readlane_b32 s8, v0, s6
-; GFX8-NEXT: v_readlane_b32 s7, v3, s6
-; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_u32 s2, s2, s8
-; GFX8-NEXT: v_writelane_b32 v2, s3, m0
-; GFX8-NEXT: s_addc_u32 s3, s3, s7
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX8-NEXT: s_cbranch_scc1 .LBB13_1
-; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execz .LBB13_4
-; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: .LBB13_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v4
-; GFX8-NEXT: v_readfirstlane_b32 s5, v3
-; GFX8-NEXT: v_mov_b32_e32 v3, s4
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v1
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
+; GFX1064_ITERATIVE-LABEL: sub_i64_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1064_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s6
+; GFX1064_ITERATIVE-NEXT: s_add_u32 s2, s2, s7
+; GFX1064_ITERATIVE-NEXT: s_addc_u32 s3, s3, s8
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1064_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB14_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1064_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v1
+; GFX1064_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
;
-; GFX9-LABEL: sub_i64_varying:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[4:5], exec
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX9-NEXT: .LBB13_1: ; %ComputeLoop
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s6, s[4:5]
-; GFX9-NEXT: s_mov_b32 m0, s6
-; GFX9-NEXT: v_readlane_b32 s8, v0, s6
-; GFX9-NEXT: v_readlane_b32 s7, v3, s6
-; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_u32 s2, s2, s8
-; GFX9-NEXT: v_writelane_b32 v2, s3, m0
-; GFX9-NEXT: s_addc_u32 s3, s3, s7
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX9-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_cbranch_scc1 .LBB13_1
-; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execz .LBB13_4
-; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: .LBB13_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v4
-; GFX9-NEXT: v_readfirstlane_b32 s5, v3
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v1
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v2, vcc
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX9-NEXT: s_endpgm
+; GFX1032_ITERATIVE-LABEL: sub_i64_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1032_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1032_ITERATIVE-NEXT: s_add_u32 s2, s2, s6
+; GFX1032_ITERATIVE-NEXT: s_addc_u32 s3, s3, s7
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1032_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB14_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1032_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1
+; GFX1032_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
;
-; GFX1064-LABEL: sub_i64_varying:
-; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: s_mov_b64 s[4:5], exec
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX1064-NEXT: .LBB13_1: ; %ComputeLoop
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s6, s[4:5]
-; GFX1064-NEXT: v_readlane_b32 s7, v0, s6
-; GFX1064-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1064-NEXT: v_writelane_b32 v1, s2, s6
-; GFX1064-NEXT: v_writelane_b32 v2, s3, s6
-; GFX1064-NEXT: s_add_u32 s2, s2, s7
-; GFX1064-NEXT: s_addc_u32 s3, s3, s8
-; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1064-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
-; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT: s_cbranch_scc1 .LBB13_1
-; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064-NEXT: s_cbranch_execz .LBB13_4
-; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v4, s3
-; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: v_mov_b32_e32 v3, s2
-; GFX1064-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB13_4:
-; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v4
-; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v1
-; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX1064-NEXT: s_endpgm
+; GFX1164_ITERATIVE-LABEL: sub_i64_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1164_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s6
+; GFX1164_ITERATIVE-NEXT: s_add_u32 s2, s2, s7
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_addc_u32 s3, s3, s8
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1164_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB14_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v1
+; GFX1164_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
;
-; GFX1032-LABEL: sub_i64_varying:
-; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: s_mov_b32 s4, exec_lo
-; GFX1032-NEXT: s_mov_b64 s[2:3], 0
-; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX1032-NEXT: .LBB13_1: ; %ComputeLoop
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s5, s4
-; GFX1032-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1032-NEXT: v_readlane_b32 s7, v3, s5
-; GFX1032-NEXT: v_writelane_b32 v1, s2, s5
-; GFX1032-NEXT: v_writelane_b32 v2, s3, s5
-; GFX1032-NEXT: s_add_u32 s2, s2, s6
-; GFX1032-NEXT: s_addc_u32 s3, s3, s7
-; GFX1032-NEXT: s_lshl_b32 s5, 1, s5
-; GFX1032-NEXT: s_andn2_b32 s4, s4, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s4, 0
-; GFX1032-NEXT: s_cbranch_scc1 .LBB13_1
-; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s4, exec_lo, s4
-; GFX1032-NEXT: s_cbranch_execz .LBB13_4
-; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v4, s3
-; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v3, s2
-; GFX1032-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB13_4:
-; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v4
-; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1
-; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX1032-NEXT: s_endpgm
+; GFX1132_ITERATIVE-LABEL: sub_i64_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1132_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1132_ITERATIVE-NEXT: s_add_u32 s2, s2, s6
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_addc_u32 s3, s3, s7
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2
+; GFX1132_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB14_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1
+; GFX1132_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
;
-; GFX1164-LABEL: sub_i64_varying:
-; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX1164-NEXT: .LBB13_1: ; %ComputeLoop
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s6, s[4:5]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1164-NEXT: v_readlane_b32 s7, v0, s6
-; GFX1164-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1164-NEXT: v_writelane_b32 v1, s2, s6
-; GFX1164-NEXT: v_writelane_b32 v2, s3, s6
-; GFX1164-NEXT: s_add_u32 s2, s2, s7
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_addc_u32 s3, s3, s8
-; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1164-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX1164-NEXT: s_cbranch_scc1 .LBB13_1
-; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1164-NEXT: s_cbranch_execz .LBB13_4
-; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v4, s3
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-NEXT: v_mov_b32_e32 v3, s2
-; GFX1164-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB13_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v4
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v1
-; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164-NEXT: s_nop 0
-; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1164-NEXT: s_endpgm
+; GFX7LESS_DPP-LABEL: sub_i64_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
;
-; GFX1132-LABEL: sub_i64_varying:
-; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: s_mov_b64 s[2:3], 0
-; GFX1132-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX1132-NEXT: .LBB13_1: ; %ComputeLoop
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s5, s4
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1132-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1132-NEXT: v_readlane_b32 s7, v3, s5
-; GFX1132-NEXT: v_writelane_b32 v1, s2, s5
-; GFX1132-NEXT: v_writelane_b32 v2, s3, s5
-; GFX1132-NEXT: s_add_u32 s2, s2, s6
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: s_addc_u32 s3, s3, s7
-; GFX1132-NEXT: s_lshl_b32 s5, 1, s5
-; GFX1132-NEXT: s_and_not1_b32 s4, s4, s5
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_cmp_lg_u32 s4, 0
-; GFX1132-NEXT: s_cbranch_scc1 .LBB13_1
-; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_xor_b32 s4, exec_lo, s4
-; GFX1132-NEXT: s_cbranch_execz .LBB13_4
-; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v4, s3
-; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2
-; GFX1132-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB13_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v4
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1
-; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132-NEXT: s_nop 0
-; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1132-NEXT: s_endpgm
+; GFX8_DPP-LABEL: sub_i64_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB14_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT: s_mov_b32 m0, -1
+; GFX8_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: .LBB14_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v10
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v9
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX8_DPP-NEXT: v_sub_u32_e32 v7, vcc, s5, v7
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_subb_u32_e32 v8, vcc, v0, v8, vcc
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: sub_i64_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB14_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX9_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: .LBB14_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v10
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v9
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX9_DPP-NEXT: v_sub_co_u32_e32 v7, vcc, s5, v7
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_subb_co_u32_e32 v8, vcc, v0, v8, vcc
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: sub_i64_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s9, v4, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB14_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s5
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s4
+; GFX1064_DPP-NEXT: ds_sub_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB14_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v11
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v12
+; GFX1064_DPP-NEXT: v_sub_co_u32 v9, vcc, s3, v9
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc, s4, v10, vcc
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: sub_i64_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT: v_readlane_b32 s3, v3, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v1, s3, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB14_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s5
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s4
+; GFX1032_DPP-NEXT: ds_sub_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB14_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v11
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v12
+; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s3, v9
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s4, v10, vcc_lo
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: sub_i64_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v9
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v7, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 15
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s4, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 31
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s5, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s9, v4, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 32
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11
+; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB14_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s5
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s4
+; GFX1164_DPP-NEXT: ds_sub_rtn_u64 v[10:11], v9, v[10:11]
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB14_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v10
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v2
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_sub_co_u32 v8, vcc, s3, v8
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s4, v9, vcc
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: sub_i64_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v9
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v5, v4
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s3, v3, 15
+; GFX1132_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s6, v4, 15
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_writelane_b32 v1, s3, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11
+; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB14_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GFX1132_DPP-NEXT: ds_sub_rtn_u64 v[10:11], v9, v[10:11]
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB14_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v10
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v2
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s3, v8
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s4, v9, vcc_lo
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%zext = zext i32 %lane to i64
@@ -3521,272 +6619,634 @@ entry:
}
define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
+; GFX7LESS_ITERATIVE-LABEL: and_i32_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, -1
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX7LESS_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_and_b32 s4, s4, s8
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB15_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: .LBB15_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX7LESS_ITERATIVE-NEXT: v_and_b32_e32 v0, s4, v1
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
;
+; GFX8_ITERATIVE-LABEL: and_i32_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, -1
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX8_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX8_ITERATIVE-NEXT: s_and_b32 s4, s4, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX8_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: .LBB15_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_and_b32_e32 v0, s4, v1
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
;
-; GFX7LESS-LABEL: and_i32_varying:
-; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX7LESS-NEXT: s_endpgm
+; GFX9_ITERATIVE-LABEL: and_i32_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, -1
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX9_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX9_ITERATIVE-NEXT: s_and_b32 s4, s4, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX9_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: .LBB15_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_and_b32_e32 v0, s4, v1
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
;
-; GFX8-LABEL: and_i32_varying:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: s_mov_b32 s4, -1
-; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: .LBB14_1: ; %ComputeLoop
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX8-NEXT: s_mov_b32 m0, s5
-; GFX8-NEXT: v_readlane_b32 s8, v0, s5
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: v_writelane_b32 v1, s4, m0
-; GFX8-NEXT: s_and_b32 s4, s4, s8
-; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8-NEXT: s_cbranch_scc1 .LBB14_1
-; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX8-NEXT: s_cbranch_execz .LBB14_4
-; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v2
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: .LBB14_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_and_b32_e32 v0, s4, v1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
+; GFX1064_ITERATIVE-LABEL: and_i32_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, -1
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1064_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_and_b32 s4, s4, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB15_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v1
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
;
-; GFX9-LABEL: and_i32_varying:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: s_mov_b32 s4, -1
-; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: .LBB14_1: ; %ComputeLoop
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: s_mov_b32 m0, s5
-; GFX9-NEXT: v_readlane_b32 s8, v0, s5
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: v_writelane_b32 v1, s4, m0
-; GFX9-NEXT: s_and_b32 s4, s4, s8
-; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9-NEXT: s_cbranch_scc1 .LBB14_1
-; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execz .LBB14_4
-; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v2
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: .LBB14_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_and_b32_e32 v0, s4, v1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX9-NEXT: s_endpgm
+; GFX1032_ITERATIVE-LABEL: and_i32_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1032_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s3, s3, s6
+; GFX1032_ITERATIVE-NEXT: s_and_b32 s2, s2, s5
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2
+; GFX1032_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB15_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v1
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
;
-; GFX1064-LABEL: and_i32_varying:
-; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: s_mov_b32 s4, -1
-; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
-; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064-NEXT: s_and_b32 s4, s4, s8
-; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1
-; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execz .LBB14_4
-; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: v_mov_b32_e32 v2, s4
-; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v2
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB14_4:
-; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_and_b32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX1064-NEXT: s_endpgm
+; GFX1164_ITERATIVE-LABEL: and_i32_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, -1
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1164_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_and_b32 s4, s4, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB15_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v1
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
;
-; GFX1032-LABEL: and_i32_varying:
-; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: .LBB14_1: ; %ComputeLoop
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
-; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
-; GFX1032-NEXT: s_and_b32 s2, s2, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1
-; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3
-; GFX1032-NEXT: s_cbranch_execz .LBB14_4
-; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s2
-; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v2
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB14_4:
-; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_and_b32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX1032-NEXT: s_endpgm
+; GFX1132_ITERATIVE-LABEL: and_i32_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1132_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s3, s3, s6
+; GFX1132_ITERATIVE-NEXT: s_and_b32 s2, s2, s5
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2
+; GFX1132_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB15_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v1
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
;
-; GFX1164-LABEL: and_i32_varying:
-; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_mov_b32 s4, -1
-; GFX1164-NEXT: ; implicit-def: $vgpr1
-; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
-; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164-NEXT: s_and_b32 s4, s4, s8
-; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
-; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execz .LBB14_4
-; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s4
-; GFX1164-NEXT: ds_and_rtn_b32 v0, v0, v2
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB14_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_and_b32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164-NEXT: s_nop 0
-; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1164-NEXT: s_endpgm
+; GFX7LESS_DPP-LABEL: and_i32_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: ds_and_rtn_b32 v0, v1, v0
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
;
-; GFX1132-LABEL: and_i32_varying:
-; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: ; implicit-def: $vgpr1
-; GFX1132-NEXT: .LBB14_1: ; %ComputeLoop
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
-; GFX1132-NEXT: s_and_b32 s2, s2, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
-; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3
-; GFX1132-NEXT: s_cbranch_execz .LBB14_4
-; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-NEXT: v_mov_b32_e32 v2, s2
-; GFX1132-NEXT: ds_and_rtn_b32 v0, v0, v2
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB14_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_and_b32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132-NEXT: s_nop 0
-; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1132-NEXT: s_endpgm
+; GFX8_DPP-LABEL: and_i32_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB15_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s4
+; GFX8_DPP-NEXT: s_mov_b32 m0, -1
+; GFX8_DPP-NEXT: ds_and_rtn_b32 v0, v0, v3
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: .LBB15_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_and_b32_e32 v0, s4, v0
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: and_i32_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB15_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s4
+; GFX9_DPP-NEXT: ds_and_rtn_b32 v0, v0, v3
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: .LBB15_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_and_b32_e32 v0, s4, v0
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: and_i32_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB15_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7
+; GFX1064_DPP-NEXT: s_mov_b32 s3, s7
+; GFX1064_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB15_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064_DPP-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: and_i32_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v3, s3, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s3, s4
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB15_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s3
+; GFX1032_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB15_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032_DPP-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: and_i32_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB15_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s7
+; GFX1164_DPP-NEXT: s_mov_b32 s3, s7
+; GFX1164_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB15_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: and_i32_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_writelane_b32 v3, s3, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132_DPP-NEXT: s_mov_b32 s3, s4
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB15_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s3
+; GFX1132_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB15_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%old = atomicrmw and ptr addrspace(3) @local_var32, i32 %lane acq_rel
@@ -3794,1372 +7254,4334 @@ entry:
ret void
}
-define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
+define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
+; GFX7LESS_ITERATIVE-LABEL: and_i64_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX7LESS_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[8:9], s[4:5], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[8:9]
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB16_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4]
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: .LBB16_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX7LESS_ITERATIVE-NEXT: v_and_b32_e32 v2, s4, v2
+; GFX7LESS_ITERATIVE-NEXT: v_and_b32_e32 v1, s5, v1
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
;
+; GFX8_ITERATIVE-LABEL: and_i64_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX8_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX8_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4]
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: .LBB16_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_and_b32_e32 v2, s4, v2
+; GFX8_ITERATIVE-NEXT: v_and_b32_e32 v1, s5, v1
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
;
-; GFX7LESS-LABEL: or_i32_varying:
-; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX7LESS-NEXT: s_endpgm
+; GFX9_ITERATIVE-LABEL: and_i64_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX9_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX9_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4]
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: .LBB16_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_and_b32_e32 v2, s4, v2
+; GFX9_ITERATIVE-NEXT: v_and_b32_e32 v1, s5, v1
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
;
-; GFX8-LABEL: or_i32_varying:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: s_mov_b32 s4, 0
-; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: .LBB15_1: ; %ComputeLoop
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX8-NEXT: s_mov_b32 m0, s5
-; GFX8-NEXT: v_readlane_b32 s8, v0, s5
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: v_writelane_b32 v1, s4, m0
-; GFX8-NEXT: s_or_b32 s4, s4, s8
-; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8-NEXT: s_cbranch_scc1 .LBB15_1
-; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX8-NEXT: s_cbranch_execz .LBB15_4
-; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v2
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: .LBB15_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_or_b32_e32 v0, s4, v1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
+; GFX1064_ITERATIVE-LABEL: and_i64_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1064_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1064_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4]
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB16_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
+; GFX1064_ITERATIVE-NEXT: v_and_b32_e32 v2, s2, v2
+; GFX1064_ITERATIVE-NEXT: v_and_b32_e32 v1, s3, v1
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
;
-; GFX9-LABEL: or_i32_varying:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: s_mov_b32 s4, 0
-; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: .LBB15_1: ; %ComputeLoop
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: s_mov_b32 m0, s5
-; GFX9-NEXT: v_readlane_b32 s8, v0, s5
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: v_writelane_b32 v1, s4, m0
-; GFX9-NEXT: s_or_b32 s4, s4, s8
-; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
-; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execz .LBB15_4
-; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v2
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: .LBB15_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_or_b32_e32 v0, s4, v1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX9-NEXT: s_endpgm
+; GFX1032_ITERATIVE-LABEL: and_i64_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1032_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s8
+; GFX1032_ITERATIVE-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1032_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4]
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB16_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
+; GFX1032_ITERATIVE-NEXT: v_and_b32_e32 v2, s2, v2
+; GFX1032_ITERATIVE-NEXT: v_and_b32_e32 v1, s3, v1
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
;
-; GFX1064-LABEL: or_i32_varying:
-; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: s_mov_b32 s4, 0
-; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
-; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064-NEXT: s_or_b32 s4, s4, s8
-; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
-; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execz .LBB15_4
-; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: v_mov_b32_e32 v2, s4
-; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v2
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB15_4:
-; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_or_b32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX1064-NEXT: s_endpgm
+; GFX1164_ITERATIVE-LABEL: and_i64_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[8:9]
+; GFX1164_ITERATIVE-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1164_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4]
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB16_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, s2, v2
+; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, s3, v1
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
;
-; GFX1032-LABEL: or_i32_varying:
-; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: .LBB15_1: ; %ComputeLoop
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
-; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
-; GFX1032-NEXT: s_or_b32 s2, s2, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
-; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3
-; GFX1032-NEXT: s_cbranch_execz .LBB15_4
-; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s2
-; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v2
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB15_4:
-; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_or_b32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX1032-NEXT: s_endpgm
+; GFX1132_ITERATIVE-LABEL: and_i64_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1132_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s8
+; GFX1132_ITERATIVE-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2
+; GFX1132_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4]
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB16_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v2, s2, v2
+; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, s3, v1
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
;
-; GFX1164-LABEL: or_i32_varying:
-; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_mov_b32 s4, 0
-; GFX1164-NEXT: ; implicit-def: $vgpr1
-; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
-; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164-NEXT: s_or_b32 s4, s4, s8
-; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
-; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execz .LBB15_4
-; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s4
-; GFX1164-NEXT: ds_or_rtn_b32 v0, v0, v2
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB15_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_or_b32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164-NEXT: s_nop 0
-; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1164-NEXT: s_endpgm
+; GFX7LESS_DPP-LABEL: and_i64_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: ds_and_rtn_b64 v[0:1], v1, v[0:1]
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
;
-; GFX1132-LABEL: or_i32_varying:
-; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: ; implicit-def: $vgpr1
-; GFX1132-NEXT: .LBB15_1: ; %ComputeLoop
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
-; GFX1132-NEXT: s_or_b32 s2, s2, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
-; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3
-; GFX1132-NEXT: s_cbranch_execz .LBB15_4
-; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-NEXT: v_mov_b32_e32 v2, s2
-; GFX1132-NEXT: ds_or_rtn_b32 v0, v0, v2
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB15_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_or_b32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132-NEXT: s_nop 0
-; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1132-NEXT: s_endpgm
+; GFX8_DPP-LABEL: and_i64_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB16_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX8_DPP-NEXT: s_mov_b32 m0, -1
+; GFX8_DPP-NEXT: ds_and_rtn_b64 v[7:8], v6, v[7:8]
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: .LBB16_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_and_b32_e32 v6, s4, v6
+; GFX8_DPP-NEXT: v_and_b32_e32 v5, s5, v5
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: and_i64_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB16_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX9_DPP-NEXT: ds_and_rtn_b64 v[7:8], v6, v[7:8]
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: .LBB16_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_and_b32_e32 v6, s4, v6
+; GFX9_DPP-NEXT: v_and_b32_e32 v5, s5, v5
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: and_i64_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, -1
+; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s4
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s5
+; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16
+; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32
+; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB16_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX1064_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB16_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v10
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v9
+; GFX1064_DPP-NEXT: v_and_b32_e32 v8, s3, v8
+; GFX1064_DPP-NEXT: v_and_b32_e32 v7, s4, v7
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: and_i64_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, -1
+; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v2, 31
+; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB16_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX1032_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB16_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v10
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v9
+; GFX1032_DPP-NEXT: v_and_b32_e32 v8, s3, v8
+; GFX1032_DPP-NEXT: v_and_b32_e32 v7, s4, v7
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: and_i64_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, -1
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16
+; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32
+; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB16_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX1164_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB16_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v10
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v9
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_e32 v8, s3, v8
+; GFX1164_DPP-NEXT: v_and_b32_e32 v7, s4, v7
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: and_i64_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v0 :: v_dual_mov_b32 v8, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v2, 31
+; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB16_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4
+; GFX1132_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB16_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v10
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v9
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_and_b32_e32 v8, s3, v8
+; GFX1132_DPP-NEXT: v_and_b32_e32 v7, s4, v7
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
- %old = atomicrmw or ptr addrspace(3) @local_var32, i32 %lane acq_rel
- store i32 %old, ptr addrspace(1) %out
+ %lane_ext = zext i32 %lane to i64
+ %old = atomicrmw and ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel
+ store i64 %old, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
+define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
+; GFX7LESS_ITERATIVE-LABEL: or_i32_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX7LESS_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_or_b32 s4, s4, s8
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB17_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: .LBB17_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX7LESS_ITERATIVE-NEXT: v_or_b32_e32 v0, s4, v1
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
;
+; GFX8_ITERATIVE-LABEL: or_i32_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX8_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX8_ITERATIVE-NEXT: s_or_b32 s4, s4, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX8_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: .LBB17_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_or_b32_e32 v0, s4, v1
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
;
-; GFX7LESS-LABEL: xor_i32_varying:
-; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX7LESS-NEXT: s_endpgm
+; GFX9_ITERATIVE-LABEL: or_i32_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX9_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX9_ITERATIVE-NEXT: s_or_b32 s4, s4, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX9_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: .LBB17_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_or_b32_e32 v0, s4, v1
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
;
-; GFX8-LABEL: xor_i32_varying:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: s_mov_b32 s4, 0
-; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: .LBB16_1: ; %ComputeLoop
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX8-NEXT: s_mov_b32 m0, s5
-; GFX8-NEXT: v_readlane_b32 s8, v0, s5
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: v_writelane_b32 v1, s4, m0
-; GFX8-NEXT: s_xor_b32 s4, s4, s8
-; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8-NEXT: s_cbranch_scc1 .LBB16_1
-; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX8-NEXT: s_cbranch_execz .LBB16_4
-; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v2
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: .LBB16_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_xor_b32_e32 v0, s4, v1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
+; GFX1064_ITERATIVE-LABEL: or_i32_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1064_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_or_b32 s4, s4, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB17_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v1
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
;
-; GFX9-LABEL: xor_i32_varying:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: s_mov_b32 s4, 0
-; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: .LBB16_1: ; %ComputeLoop
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: s_mov_b32 m0, s5
-; GFX9-NEXT: v_readlane_b32 s8, v0, s5
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: v_writelane_b32 v1, s4, m0
-; GFX9-NEXT: s_xor_b32 s4, s4, s8
-; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9-NEXT: s_cbranch_scc1 .LBB16_1
-; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execz .LBB16_4
-; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v2
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: .LBB16_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_xor_b32_e32 v0, s4, v1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX9-NEXT: s_endpgm
+; GFX1032_ITERATIVE-LABEL: or_i32_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1032_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s3, s3, s6
+; GFX1032_ITERATIVE-NEXT: s_or_b32 s2, s2, s5
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2
+; GFX1032_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB17_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v1
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
;
-; GFX1064-LABEL: xor_i32_varying:
-; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: s_mov_b32 s4, 0
-; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: .LBB16_1: ; %ComputeLoop
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
-; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064-NEXT: s_xor_b32 s4, s4, s8
-; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064-NEXT: s_cbranch_scc1 .LBB16_1
-; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execz .LBB16_4
-; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: v_mov_b32_e32 v2, s4
-; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v2
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB16_4:
-; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_xor_b32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX1064-NEXT: s_endpgm
+; GFX1164_ITERATIVE-LABEL: or_i32_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1164_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_or_b32 s4, s4, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB17_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v1
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
;
-; GFX1032-LABEL: xor_i32_varying:
-; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: .LBB16_1: ; %ComputeLoop
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
-; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
-; GFX1032-NEXT: s_xor_b32 s2, s2, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1032-NEXT: s_cbranch_scc1 .LBB16_1
-; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3
-; GFX1032-NEXT: s_cbranch_execz .LBB16_4
-; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s2
-; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v2
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB16_4:
-; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_xor_b32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX1032-NEXT: s_endpgm
+; GFX1132_ITERATIVE-LABEL: or_i32_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1132_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s3, s3, s6
+; GFX1132_ITERATIVE-NEXT: s_or_b32 s2, s2, s5
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2
+; GFX1132_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB17_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v1
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
;
-; GFX1164-LABEL: xor_i32_varying:
-; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_mov_b32 s4, 0
-; GFX1164-NEXT: ; implicit-def: $vgpr1
-; GFX1164-NEXT: .LBB16_1: ; %ComputeLoop
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
-; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164-NEXT: s_xor_b32 s4, s4, s8
-; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164-NEXT: s_cbranch_scc1 .LBB16_1
-; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execz .LBB16_4
-; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s4
-; GFX1164-NEXT: ds_xor_rtn_b32 v0, v0, v2
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB16_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_xor_b32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164-NEXT: s_nop 0
-; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1164-NEXT: s_endpgm
+; GFX7LESS_DPP-LABEL: or_i32_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: ds_or_rtn_b32 v0, v1, v0
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
;
-; GFX1132-LABEL: xor_i32_varying:
-; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: ; implicit-def: $vgpr1
-; GFX1132-NEXT: .LBB16_1: ; %ComputeLoop
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
-; GFX1132-NEXT: s_xor_b32 s2, s2, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1132-NEXT: s_cbranch_scc1 .LBB16_1
-; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3
-; GFX1132-NEXT: s_cbranch_execz .LBB16_4
-; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-NEXT: v_mov_b32_e32 v2, s2
-; GFX1132-NEXT: ds_xor_rtn_b32 v0, v0, v2
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB16_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_xor_b32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132-NEXT: s_nop 0
-; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1132-NEXT: s_endpgm
+; GFX8_DPP-LABEL: or_i32_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB17_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX8_DPP-NEXT: s_mov_b32 m0, -1
+; GFX8_DPP-NEXT: ds_or_rtn_b32 v0, v3, v0
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: .LBB17_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_or_b32_e32 v0, s4, v0
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: or_i32_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB17_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX9_DPP-NEXT: ds_or_rtn_b32 v0, v3, v0
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: .LBB17_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_or_b32_e32 v0, s4, v0
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: or_i32_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB17_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s7
+; GFX1064_DPP-NEXT: s_mov_b32 s3, s7
+; GFX1064_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB17_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064_DPP-NEXT: v_or_b32_e32 v0, s3, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: or_i32_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v3, s3, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s3, s4
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB17_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s3
+; GFX1032_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB17_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032_DPP-NEXT: v_or_b32_e32 v0, s3, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: or_i32_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB17_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s7
+; GFX1164_DPP-NEXT: s_mov_b32 s3, s7
+; GFX1164_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB17_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_or_b32_e32 v0, s3, v0
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: or_i32_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_writelane_b32 v3, s3, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132_DPP-NEXT: s_mov_b32 s3, s4
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB17_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s3
+; GFX1132_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB17_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_or_b32_e32 v0, s3, v0
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
- %old = atomicrmw xor ptr addrspace(3) @local_var32, i32 %lane acq_rel
+ %old = atomicrmw or ptr addrspace(3) @local_var32, i32 %lane acq_rel
store i32 %old, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
+define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
+; GFX7LESS_ITERATIVE-LABEL: or_i64_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX7LESS_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[8:9], s[4:5], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[8:9]
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB18_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4]
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: .LBB18_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX7LESS_ITERATIVE-NEXT: v_or_b32_e32 v2, s4, v2
+; GFX7LESS_ITERATIVE-NEXT: v_or_b32_e32 v1, s5, v1
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
;
+; GFX8_ITERATIVE-LABEL: or_i64_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX8_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX8_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4]
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: .LBB18_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_or_b32_e32 v2, s4, v2
+; GFX8_ITERATIVE-NEXT: v_or_b32_e32 v1, s5, v1
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
;
-; GFX7LESS-LABEL: max_i32_varying:
-; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX7LESS-NEXT: s_endpgm
+; GFX9_ITERATIVE-LABEL: or_i64_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX9_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX9_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4]
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: .LBB18_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_or_b32_e32 v2, s4, v2
+; GFX9_ITERATIVE-NEXT: v_or_b32_e32 v1, s5, v1
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
;
-; GFX8-LABEL: max_i32_varying:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: s_brev_b32 s4, 1
-; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: .LBB17_1: ; %ComputeLoop
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX8-NEXT: s_mov_b32 m0, s5
-; GFX8-NEXT: v_readlane_b32 s8, v0, s5
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: v_writelane_b32 v1, s4, m0
-; GFX8-NEXT: s_max_i32 s4, s4, s8
-; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8-NEXT: s_cbranch_scc1 .LBB17_1
-; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX8-NEXT: s_cbranch_execz .LBB17_4
-; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v2
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: .LBB17_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_max_i32_e32 v0, s4, v1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
+; GFX1064_ITERATIVE-LABEL: or_i64_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1064_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1064_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4]
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB18_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
+; GFX1064_ITERATIVE-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX1064_ITERATIVE-NEXT: v_or_b32_e32 v1, s3, v1
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
;
-; GFX9-LABEL: max_i32_varying:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: s_brev_b32 s4, 1
-; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: .LBB17_1: ; %ComputeLoop
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: s_mov_b32 m0, s5
-; GFX9-NEXT: v_readlane_b32 s8, v0, s5
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: v_writelane_b32 v1, s4, m0
-; GFX9-NEXT: s_max_i32 s4, s4, s8
-; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
-; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execz .LBB17_4
-; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v2
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: .LBB17_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_max_i32_e32 v0, s4, v1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX9-NEXT: s_endpgm
+; GFX1032_ITERATIVE-LABEL: or_i64_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1032_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s8
+; GFX1032_ITERATIVE-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1032_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4]
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB18_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
+; GFX1032_ITERATIVE-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX1032_ITERATIVE-NEXT: v_or_b32_e32 v1, s3, v1
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
;
-; GFX1064-LABEL: max_i32_varying:
-; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: s_brev_b32 s4, 1
-; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
-; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064-NEXT: s_max_i32 s4, s4, s8
-; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
-; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execz .LBB17_4
-; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: v_mov_b32_e32 v2, s4
-; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v2
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB17_4:
-; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_max_i32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX1064-NEXT: s_endpgm
+; GFX1164_ITERATIVE-LABEL: or_i64_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[8:9]
+; GFX1164_ITERATIVE-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1164_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4]
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB18_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX1164_ITERATIVE-NEXT: v_or_b32_e32 v1, s3, v1
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
;
-; GFX1032-LABEL: max_i32_varying:
-; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_brev_b32 s2, 1
-; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: .LBB17_1: ; %ComputeLoop
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
-; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
-; GFX1032-NEXT: s_max_i32 s2, s2, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
-; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3
-; GFX1032-NEXT: s_cbranch_execz .LBB17_4
-; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s2
-; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v2
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB17_4:
-; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_max_i32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX1032-NEXT: s_endpgm
+; GFX1132_ITERATIVE-LABEL: or_i64_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1132_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s8
+; GFX1132_ITERATIVE-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2
+; GFX1132_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4]
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB18_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX1132_ITERATIVE-NEXT: v_or_b32_e32 v1, s3, v1
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
;
-; GFX1164-LABEL: max_i32_varying:
-; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_brev_b32 s4, 1
-; GFX1164-NEXT: ; implicit-def: $vgpr1
-; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
-; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164-NEXT: s_max_i32 s4, s4, s8
-; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
-; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execz .LBB17_4
-; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s4
-; GFX1164-NEXT: ds_max_rtn_i32 v0, v0, v2
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB17_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_max_i32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164-NEXT: s_nop 0
-; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1164-NEXT: s_endpgm
+; GFX7LESS_DPP-LABEL: or_i64_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: ds_or_rtn_b64 v[0:1], v1, v[0:1]
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
;
-; GFX1132-LABEL: max_i32_varying:
-; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_brev_b32 s2, 1
-; GFX1132-NEXT: ; implicit-def: $vgpr1
-; GFX1132-NEXT: .LBB17_1: ; %ComputeLoop
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
-; GFX1132-NEXT: s_max_i32 s2, s2, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
-; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3
-; GFX1132-NEXT: s_cbranch_execz .LBB17_4
-; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-NEXT: v_mov_b32_e32 v2, s2
-; GFX1132-NEXT: ds_max_rtn_i32 v0, v0, v2
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB17_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_max_i32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132-NEXT: s_nop 0
-; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1132-NEXT: s_endpgm
+; GFX8_DPP-LABEL: or_i64_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB18_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX8_DPP-NEXT: s_mov_b32 m0, -1
+; GFX8_DPP-NEXT: ds_or_rtn_b64 v[7:8], v6, v[7:8]
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: .LBB18_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_or_b32_e32 v6, s4, v6
+; GFX8_DPP-NEXT: v_or_b32_e32 v5, s5, v5
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: or_i64_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB18_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX9_DPP-NEXT: ds_or_rtn_b64 v[7:8], v6, v[7:8]
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: .LBB18_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_or_b32_e32 v6, s4, v6
+; GFX9_DPP-NEXT: v_or_b32_e32 v5, s5, v5
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: or_i64_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s4
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s5
+; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16
+; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32
+; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB18_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX1064_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB18_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v10
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v9
+; GFX1064_DPP-NEXT: v_or_b32_e32 v8, s3, v8
+; GFX1064_DPP-NEXT: v_or_b32_e32 v7, s4, v7
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: or_i64_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v2, 31
+; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB18_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX1032_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB18_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v10
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v9
+; GFX1032_DPP-NEXT: v_or_b32_e32 v8, s3, v8
+; GFX1032_DPP-NEXT: v_or_b32_e32 v7, s4, v7
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: or_i64_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16
+; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32
+; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB18_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX1164_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB18_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v10
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v9
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_e32 v8, s3, v8
+; GFX1164_DPP-NEXT: v_or_b32_e32 v7, s4, v7
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: or_i64_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v0 :: v_dual_mov_b32 v8, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v2, 31
+; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB18_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4
+; GFX1132_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB18_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v10
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v9
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_or_b32_e32 v8, s3, v8
+; GFX1132_DPP-NEXT: v_or_b32_e32 v7, s4, v7
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
- %old = atomicrmw max ptr addrspace(3) @local_var32, i32 %lane acq_rel
- store i32 %old, ptr addrspace(1) %out
+ %lane_ext = zext i32 %lane to i64
+ %old = atomicrmw or ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel
+ store i64 %old, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
+define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
+; GFX7LESS_ITERATIVE-LABEL: xor_i32_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX7LESS_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB19_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: .LBB19_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX7LESS_ITERATIVE-NEXT: v_xor_b32_e32 v0, s4, v1
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
;
+; GFX8_ITERATIVE-LABEL: xor_i32_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX8_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX8_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX8_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: .LBB19_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_xor_b32_e32 v0, s4, v1
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
;
-; GFX7LESS-LABEL: max_i64_constant:
-; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX7LESS-NEXT: s_cbranch_execz .LBB18_2
-; GFX7LESS-NEXT: ; %bb.1:
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0
-; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: .LBB18_2:
-; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
-; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
-; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX7LESS-NEXT: s_endpgm
+; GFX9_ITERATIVE-LABEL: xor_i32_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX9_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX9_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX9_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: .LBB19_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_xor_b32_e32 v0, s4, v1
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
;
-; GFX8-LABEL: max_i64_constant:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_cbranch_execz .LBB18_2
-; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: v_mov_b32_e32 v0, 5
-; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, 0
-; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: .LBB18_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_bfrev_b32_e32 v0, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
+; GFX1064_ITERATIVE-LABEL: xor_i32_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1064_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB19_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v1
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
;
-; GFX9-LABEL: max_i64_constant:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB18_2
-; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: v_mov_b32_e32 v0, 5
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: .LBB18_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_bfrev_b32_e32 v0, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX9-NEXT: s_endpgm
+; GFX1032_ITERATIVE-LABEL: xor_i32_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1032_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s3, s3, s6
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, s2, s5
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2
+; GFX1032_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB19_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v1
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
;
-; GFX1064-LABEL: max_i64_constant:
-; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB18_2
-; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: v_mov_b32_e32 v0, 5
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB18_2:
-; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
-; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX1064-NEXT: s_endpgm
+; GFX1164_ITERATIVE-LABEL: xor_i32_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1164_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB19_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v1
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
;
-; GFX1032-LABEL: max_i64_constant:
-; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB18_2
-; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: v_mov_b32_e32 v0, 5
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB18_2:
-; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
-; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
-; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX1032-NEXT: s_endpgm
+; GFX1132_ITERATIVE-LABEL: xor_i32_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1132_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s3, s3, s6
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, s2, s5
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2
+; GFX1132_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB19_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v1
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
;
-; GFX1164-LABEL: max_i64_constant:
-; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1164-NEXT: s_cbranch_execz .LBB18_2
-; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 5
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB18_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
-; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164-NEXT: s_nop 0
-; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1164-NEXT: s_endpgm
+; GFX7LESS_DPP-LABEL: xor_i32_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: ds_xor_rtn_b32 v0, v1, v0
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
;
-; GFX1132-LABEL: max_i64_constant:
-; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo
-; GFX1132-NEXT: s_cbranch_execz .LBB18_2
-; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: v_mov_b32_e32 v0, 5
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
-; GFX1132-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB18_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
-; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132-NEXT: s_nop 0
-; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1132-NEXT: s_endpgm
+; GFX8_DPP-LABEL: xor_i32_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB19_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX8_DPP-NEXT: s_mov_b32 m0, -1
+; GFX8_DPP-NEXT: ds_xor_rtn_b32 v0, v3, v0
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: .LBB19_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_xor_b32_e32 v0, s4, v0
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: xor_i32_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB19_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX9_DPP-NEXT: ds_xor_rtn_b32 v0, v3, v0
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: .LBB19_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_xor_b32_e32 v0, s4, v0
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: xor_i32_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB19_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s7
+; GFX1064_DPP-NEXT: s_mov_b32 s3, s7
+; GFX1064_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB19_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064_DPP-NEXT: v_xor_b32_e32 v0, s3, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: xor_i32_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v3, s3, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s3, s4
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB19_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s3
+; GFX1032_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB19_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032_DPP-NEXT: v_xor_b32_e32 v0, s3, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: xor_i32_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB19_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s7
+; GFX1164_DPP-NEXT: s_mov_b32 s3, s7
+; GFX1164_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB19_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_xor_b32_e32 v0, s3, v0
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: xor_i32_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_writelane_b32 v3, s3, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132_DPP-NEXT: s_mov_b32 s3, s4
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB19_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s3
+; GFX1132_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB19_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_xor_b32_e32 v0, s3, v0
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
entry:
- %old = atomicrmw max ptr addrspace(3) @local_var64, i64 5 acq_rel
- store i64 %old, ptr addrspace(1) %out
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %old = atomicrmw xor ptr addrspace(3) @local_var32, i32 %lane acq_rel
+ store i32 %old, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
+define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
+; GFX7LESS_ITERATIVE-LABEL: xor_i64_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX7LESS_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[8:9], s[4:5], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[8:9]
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB20_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4]
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: .LBB20_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX7LESS_ITERATIVE-NEXT: v_xor_b32_e32 v2, s4, v2
+; GFX7LESS_ITERATIVE-NEXT: v_xor_b32_e32 v1, s5, v1
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
;
+; GFX8_ITERATIVE-LABEL: xor_i64_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX8_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX8_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4]
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: .LBB20_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_xor_b32_e32 v2, s4, v2
+; GFX8_ITERATIVE-NEXT: v_xor_b32_e32 v1, s5, v1
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
;
-; GFX7LESS-LABEL: min_i32_varying:
-; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX7LESS-NEXT: s_endpgm
+; GFX9_ITERATIVE-LABEL: xor_i64_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX9_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX9_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4]
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: .LBB20_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_xor_b32_e32 v2, s4, v2
+; GFX9_ITERATIVE-NEXT: v_xor_b32_e32 v1, s5, v1
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
;
-; GFX8-LABEL: min_i32_varying:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: s_brev_b32 s4, -2
-; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: .LBB19_1: ; %ComputeLoop
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX8-NEXT: s_mov_b32 m0, s5
-; GFX8-NEXT: v_readlane_b32 s8, v0, s5
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: v_writelane_b32 v1, s4, m0
-; GFX8-NEXT: s_min_i32 s4, s4, s8
-; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8-NEXT: s_cbranch_scc1 .LBB19_1
-; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX8-NEXT: s_cbranch_execz .LBB19_4
-; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v2
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: .LBB19_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_min_i32_e32 v0, s4, v1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
+; GFX1064_ITERATIVE-LABEL: xor_i64_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1064_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1064_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4]
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB20_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
+; GFX1064_ITERATIVE-NEXT: v_xor_b32_e32 v2, s2, v2
+; GFX1064_ITERATIVE-NEXT: v_xor_b32_e32 v1, s3, v1
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
;
-; GFX9-LABEL: min_i32_varying:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: .LBB19_1: ; %ComputeLoop
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: s_mov_b32 m0, s5
-; GFX9-NEXT: v_readlane_b32 s8, v0, s5
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: v_writelane_b32 v1, s4, m0
-; GFX9-NEXT: s_min_i32 s4, s4, s8
-; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9-NEXT: s_cbranch_scc1 .LBB19_1
-; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execz .LBB19_4
-; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v2
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: .LBB19_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_min_i32_e32 v0, s4, v1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX9-NEXT: s_endpgm
+; GFX1032_ITERATIVE-LABEL: xor_i64_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1032_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s8
+; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1032_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4]
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB20_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
+; GFX1032_ITERATIVE-NEXT: v_xor_b32_e32 v2, s2, v2
+; GFX1032_ITERATIVE-NEXT: v_xor_b32_e32 v1, s3, v1
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
;
-; GFX1064-LABEL: min_i32_varying:
-; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: s_brev_b32 s4, -2
-; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: .LBB19_1: ; %ComputeLoop
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
-; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064-NEXT: s_min_i32 s4, s4, s8
-; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064-NEXT: s_cbranch_scc1 .LBB19_1
-; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execz .LBB19_4
-; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: v_mov_b32_e32 v2, s4
-; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v2
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB19_4:
-; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_min_i32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX1064-NEXT: s_endpgm
+; GFX1164_ITERATIVE-LABEL: xor_i64_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[8:9]
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1164_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4]
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB20_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: v_xor_b32_e32 v2, s2, v2
+; GFX1164_ITERATIVE-NEXT: v_xor_b32_e32 v1, s3, v1
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
;
-; GFX1032-LABEL: min_i32_varying:
-; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_brev_b32 s2, -2
-; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: .LBB19_1: ; %ComputeLoop
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
-; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
-; GFX1032-NEXT: s_min_i32 s2, s2, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1032-NEXT: s_cbranch_scc1 .LBB19_1
-; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3
-; GFX1032-NEXT: s_cbranch_execz .LBB19_4
-; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s2
-; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v2
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB19_4:
-; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_min_i32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX1032-NEXT: s_endpgm
+; GFX1132_ITERATIVE-LABEL: xor_i64_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1132_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s8
+; GFX1132_ITERATIVE-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2
+; GFX1132_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4]
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB20_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: v_xor_b32_e32 v2, s2, v2
+; GFX1132_ITERATIVE-NEXT: v_xor_b32_e32 v1, s3, v1
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
;
-; GFX1164-LABEL: min_i32_varying:
-; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_brev_b32 s4, -2
-; GFX1164-NEXT: ; implicit-def: $vgpr1
-; GFX1164-NEXT: .LBB19_1: ; %ComputeLoop
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
-; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164-NEXT: s_min_i32 s4, s4, s8
-; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164-NEXT: s_cbranch_scc1 .LBB19_1
-; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execz .LBB19_4
-; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s4
-; GFX1164-NEXT: ds_min_rtn_i32 v0, v0, v2
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB19_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_min_i32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164-NEXT: s_nop 0
-; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1164-NEXT: s_endpgm
+; GFX7LESS_DPP-LABEL: xor_i64_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: ds_xor_rtn_b64 v[0:1], v1, v[0:1]
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
;
-; GFX1132-LABEL: min_i32_varying:
-; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_brev_b32 s2, -2
-; GFX1132-NEXT: ; implicit-def: $vgpr1
-; GFX1132-NEXT: .LBB19_1: ; %ComputeLoop
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
-; GFX1132-NEXT: s_min_i32 s2, s2, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1132-NEXT: s_cbranch_scc1 .LBB19_1
-; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3
-; GFX1132-NEXT: s_cbranch_execz .LBB19_4
-; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-NEXT: v_mov_b32_e32 v2, s2
-; GFX1132-NEXT: ds_min_rtn_i32 v0, v0, v2
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB19_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_min_i32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132-NEXT: s_nop 0
-; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1132-NEXT: s_endpgm
+; GFX8_DPP-LABEL: xor_i64_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB20_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX8_DPP-NEXT: s_mov_b32 m0, -1
+; GFX8_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v6, v[7:8]
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: .LBB20_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_xor_b32_e32 v6, s4, v6
+; GFX8_DPP-NEXT: v_xor_b32_e32 v5, s5, v5
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: xor_i64_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB20_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX9_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v6, v[7:8]
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: .LBB20_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_xor_b32_e32 v6, s4, v6
+; GFX9_DPP-NEXT: v_xor_b32_e32 v5, s5, v5
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: xor_i64_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s4
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s5
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16
+; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32
+; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB20_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX1064_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB20_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v10
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v9
+; GFX1064_DPP-NEXT: v_xor_b32_e32 v8, s3, v8
+; GFX1064_DPP-NEXT: v_xor_b32_e32 v7, s4, v7
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: xor_i64_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v2, 31
+; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB20_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX1032_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB20_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v10
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v9
+; GFX1032_DPP-NEXT: v_xor_b32_e32 v8, s3, v8
+; GFX1032_DPP-NEXT: v_xor_b32_e32 v7, s4, v7
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: xor_i64_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16
+; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32
+; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB20_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX1164_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB20_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v10
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v9
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_e32 v8, s3, v8
+; GFX1164_DPP-NEXT: v_xor_b32_e32 v7, s4, v7
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: xor_i64_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v0 :: v_dual_mov_b32 v8, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v2, 31
+; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB20_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4
+; GFX1132_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB20_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v10
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v9
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_e32 v8, s3, v8
+; GFX1132_DPP-NEXT: v_xor_b32_e32 v7, s4, v7
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %lane_ext = zext i32 %lane to i64
+ %old = atomicrmw xor ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel
+ store i64 %old, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
+; GFX7LESS_ITERATIVE-LABEL: max_i32_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s4, 1
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX7LESS_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_max_i32 s4, s4, s8
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB21_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: .LBB21_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX7LESS_ITERATIVE-NEXT: v_max_i32_e32 v0, s4, v1
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
+;
+; GFX8_ITERATIVE-LABEL: max_i32_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX8_ITERATIVE-NEXT: s_brev_b32 s4, 1
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX8_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX8_ITERATIVE-NEXT: s_max_i32 s4, s4, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX8_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: .LBB21_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_max_i32_e32 v0, s4, v1
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
+;
+; GFX9_ITERATIVE-LABEL: max_i32_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX9_ITERATIVE-NEXT: s_brev_b32 s4, 1
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX9_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX9_ITERATIVE-NEXT: s_max_i32 s4, s4, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX9_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: .LBB21_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_max_i32_e32 v0, s4, v1
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1064_ITERATIVE-LABEL: max_i32_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064_ITERATIVE-NEXT: s_brev_b32 s4, 1
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1064_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_max_i32 s4, s4, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB21_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v1
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1032_ITERATIVE-LABEL: max_i32_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_brev_b32 s2, 1
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1032_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s3, s3, s6
+; GFX1032_ITERATIVE-NEXT: s_max_i32 s2, s2, s5
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2
+; GFX1032_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB21_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v1
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1164_ITERATIVE-LABEL: max_i32_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164_ITERATIVE-NEXT: s_brev_b32 s4, 1
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1164_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_max_i32 s4, s4, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB21_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v1
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1132_ITERATIVE-LABEL: max_i32_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_brev_b32 s2, 1
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1132_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s3, s3, s6
+; GFX1132_ITERATIVE-NEXT: s_max_i32 s2, s2, s5
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2
+; GFX1132_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB21_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v1
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
+;
+; GFX7LESS_DPP-LABEL: max_i32_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: ds_max_rtn_i32 v0, v1, v0
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
+;
+; GFX8_DPP-LABEL: max_i32_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB21_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s4
+; GFX8_DPP-NEXT: s_mov_b32 m0, -1
+; GFX8_DPP-NEXT: ds_max_rtn_i32 v0, v0, v3
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: .LBB21_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_max_i32_e32 v0, s4, v0
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: max_i32_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB21_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s4
+; GFX9_DPP-NEXT: ds_max_rtn_i32 v0, v0, v3
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: .LBB21_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_max_i32_e32 v0, s4, v0
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: max_i32_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB21_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7
+; GFX1064_DPP-NEXT: s_mov_b32 s3, s7
+; GFX1064_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB21_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064_DPP-NEXT: v_max_i32_e32 v0, s3, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: max_i32_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v3, s3, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s3, s4
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB21_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s3
+; GFX1032_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB21_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032_DPP-NEXT: v_max_i32_e32 v0, s3, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: max_i32_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB21_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s7
+; GFX1164_DPP-NEXT: s_mov_b32 s3, s7
+; GFX1164_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB21_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_max_i32_e32 v0, s3, v0
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: max_i32_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_writelane_b32 v3, s3, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132_DPP-NEXT: s_mov_b32 s3, s4
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB21_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s3
+; GFX1132_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB21_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_max_i32_e32 v0, s3, v0
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
- %old = atomicrmw min ptr addrspace(3) @local_var32, i32 %lane acq_rel
+ %old = atomicrmw max ptr addrspace(3) @local_var32, i32 %lane acq_rel
store i32 %old, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
-;
-;
-; GFX7LESS-LABEL: min_i64_constant:
+define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
+; GFX7LESS-LABEL: max_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX7LESS-NEXT: s_cbranch_execz .LBB20_2
+; GFX7LESS-NEXT: s_cbranch_execz .LBB22_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
+; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: .LBB20_2:
+; GFX7LESS-NEXT: .LBB22_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2
-; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
+; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -5167,30 +11589,30 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
-; GFX8-LABEL: min_i64_constant:
+; GFX8-LABEL: max_i64_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_cbranch_execz .LBB20_2
+; GFX8-NEXT: s_cbranch_execz .LBB22_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
+; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: .LBB20_2:
+; GFX8-NEXT: .LBB22_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_bfrev_b32_e32 v0, -2
+; GFX8-NEXT: v_bfrev_b32_e32 v0, 1
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
+; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s4
@@ -5201,29 +11623,29 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
-; GFX9-LABEL: min_i64_constant:
+; GFX9-LABEL: max_i64_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB20_2
+; GFX9-NEXT: s_cbranch_execz .LBB22_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
+; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: .LBB20_2:
+; GFX9-NEXT: .LBB22_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_bfrev_b32_e32 v0, -2
+; GFX9-NEXT: v_bfrev_b32_e32 v0, 1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
+; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -5234,30 +11656,30 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: min_i64_constant:
+; GFX1064-LABEL: max_i64_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB20_2
+; GFX1064-NEXT: s_cbranch_execz .LBB22_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
+; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB20_2:
+; GFX1064-NEXT: .LBB22_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
-; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
+; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
@@ -5266,29 +11688,29 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: min_i64_constant:
+; GFX1032-LABEL: max_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB20_2
+; GFX1032-NEXT: s_cbranch_execz .LBB22_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
+; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB20_2:
+; GFX1032-NEXT: .LBB22_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
-; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
-; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
+; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
@@ -5297,7 +11719,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: min_i64_constant:
+; GFX1164-LABEL: max_i64_constant:
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5305,23 +11727,23 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1164-NEXT: s_cbranch_execz .LBB20_2
+; GFX1164-NEXT: s_cbranch_execz .LBB22_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: v_mov_b32_e32 v0, 5
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
+; GFX1164-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB20_2:
+; GFX1164-NEXT: .LBB22_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
-; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
+; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
+; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
+; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
@@ -5332,29 +11754,29 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: min_i64_constant:
+; GFX1132-LABEL: max_i64_constant:
; GFX1132: ; %bb.0: ; %entry
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo
-; GFX1132-NEXT: s_cbranch_execz .LBB20_2
+; GFX1132-NEXT: s_cbranch_execz .LBB22_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: v_mov_b32_e32 v0, 5
; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
-; GFX1132-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
+; GFX1132-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB20_2:
+; GFX1132-NEXT: .LBB22_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
-; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
+; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
+; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1132-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
@@ -5365,278 +11787,3595 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
- %old = atomicrmw min ptr addrspace(3) @local_var64, i64 5 acq_rel
+ %old = atomicrmw max ptr addrspace(3) @local_var64, i64 5 acq_rel
store i64 %old, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
+define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
+; GFX7LESS_ITERATIVE-LABEL: max_i64_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s3, 1
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX7LESS_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9
+; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB23_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4]
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: .LBB23_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2]
+; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
+; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
+;
+; GFX8_ITERATIVE-LABEL: max_i64_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_ITERATIVE-NEXT: s_brev_b32 s3, 1
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX8_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX8_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
+; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9
+; GFX8_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX8_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4]
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: .LBB23_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
+; GFX8_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2]
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
+; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
+;
+; GFX9_ITERATIVE-LABEL: max_i64_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_ITERATIVE-NEXT: s_brev_b32 s3, 1
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX9_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX9_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
+; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9
+; GFX9_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX9_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4]
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: .LBB23_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
+; GFX9_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2]
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
+; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1064_ITERATIVE-LABEL: max_i64_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064_ITERATIVE-NEXT: s_brev_b32 s3, 1
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1064_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10
+; GFX1064_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s[8:9], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7
+; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1064_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4]
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB23_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1064_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[1:2]
+; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc
+; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1032_ITERATIVE-LABEL: max_i64_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_brev_b32 s3, 1
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1032_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1032_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s8, s[2:3], s[6:7]
+; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7
+; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1032_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4]
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB23_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1032_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[1:2]
+; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo
+; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1164_ITERATIVE-LABEL: max_i64_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164_ITERATIVE-NEXT: s_brev_b32 s3, 1
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1164_ITERATIVE-NEXT: .p2align 6
+; GFX1164_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10
+; GFX1164_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s[8:9], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7
+; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1164_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4]
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB23_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[1:2]
+; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc
+; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1132_ITERATIVE-LABEL: max_i64_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_brev_b32 s3, 1
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1132_ITERATIVE-NEXT: .p2align 6
+; GFX1132_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1132_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s8, s[2:3], s[6:7]
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7
+; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2
+; GFX1132_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4]
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB23_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[1:2]
+; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo
+; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
+;
+; GFX7LESS_DPP-LABEL: max_i64_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: ds_max_rtn_i64 v[0:1], v1, v[0:1]
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
+;
+; GFX8_DPP-LABEL: max_i64_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX8_DPP-NEXT: s_mov_b32 s2, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX8_DPP-NEXT: s_brev_b32 s3, 1
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s3
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s3, v2, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8_DPP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB23_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT: s_mov_b32 m0, -1
+; GFX8_DPP-NEXT: ds_max_rtn_i64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: .LBB23_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: max_i64_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX9_DPP-NEXT: s_mov_b32 s2, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX9_DPP-NEXT: s_brev_b32 s3, 1
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s3
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s3, v2, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB23_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX9_DPP-NEXT: ds_max_rtn_i64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: .LBB23_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: max_i64_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1064_DPP-NEXT: s_mov_b32 s2, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1064_DPP-NEXT: s_brev_b32 s3, 1
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s3
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s3
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 31
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s7, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB23_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s5
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s4
+; GFX1064_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB23_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v12
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[9:10]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: max_i64_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1032_DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032_DPP-NEXT: s_brev_b32 s3, 1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s3
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, s2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s3
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX1032_DPP-NEXT: v_readlane_b32 s3, v4, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1032_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v2, s3, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB23_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s5
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s4
+; GFX1032_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB23_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v12
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc_lo
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: max_i64_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1164_DPP-NEXT: s_mov_b32 s2, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1164_DPP-NEXT: s_brev_b32 s3, 1
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s3
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s2
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s3
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s4, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s8, v3, 31
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s5, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s6, 32
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB23_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s5
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s4
+; GFX1164_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB23_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v12
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[9:10]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: max_i64_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v10, 0
+; GFX1132_DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132_DPP-NEXT: s_brev_b32 s3, 1
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, s2
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s3
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v3
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_readlane_b32 s3, v4, 15
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1132_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_writelane_b32 v2, s3, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB23_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v11, s4
+; GFX1132_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB23_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v12
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc_lo
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %lane_ext = zext i32 %lane to i64
+ %old = atomicrmw max ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel
+ store i64 %old, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
+; GFX7LESS_ITERATIVE-LABEL: min_i32_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s4, -2
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX7LESS_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_min_i32 s4, s4, s8
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB24_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: .LBB24_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX7LESS_ITERATIVE-NEXT: v_min_i32_e32 v0, s4, v1
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
+;
+; GFX8_ITERATIVE-LABEL: min_i32_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX8_ITERATIVE-NEXT: s_brev_b32 s4, -2
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX8_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX8_ITERATIVE-NEXT: s_min_i32 s4, s4, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX8_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: .LBB24_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_min_i32_e32 v0, s4, v1
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
+;
+; GFX9_ITERATIVE-LABEL: min_i32_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX9_ITERATIVE-NEXT: s_brev_b32 s4, -2
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX9_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX9_ITERATIVE-NEXT: s_min_i32 s4, s4, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX9_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: .LBB24_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_min_i32_e32 v0, s4, v1
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1064_ITERATIVE-LABEL: min_i32_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064_ITERATIVE-NEXT: s_brev_b32 s4, -2
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1064_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_min_i32 s4, s4, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB24_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v1
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1032_ITERATIVE-LABEL: min_i32_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_brev_b32 s2, -2
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1032_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s3, s3, s6
+; GFX1032_ITERATIVE-NEXT: s_min_i32 s2, s2, s5
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2
+; GFX1032_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB24_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v1
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1164_ITERATIVE-LABEL: min_i32_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164_ITERATIVE-NEXT: s_brev_b32 s4, -2
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1164_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_min_i32 s4, s4, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB24_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v1
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1132_ITERATIVE-LABEL: min_i32_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_brev_b32 s2, -2
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1132_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s3, s3, s6
+; GFX1132_ITERATIVE-NEXT: s_min_i32 s2, s2, s5
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2
+; GFX1132_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB24_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v1
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
+;
+; GFX7LESS_DPP-LABEL: min_i32_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: ds_min_rtn_i32 v0, v1, v0
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
+;
+; GFX8_DPP-LABEL: min_i32_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, -2
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB24_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s4
+; GFX8_DPP-NEXT: s_mov_b32 m0, -1
+; GFX8_DPP-NEXT: ds_min_rtn_i32 v0, v0, v3
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: .LBB24_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_min_i32_e32 v0, s4, v0
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
;
+; GFX9_DPP-LABEL: min_i32_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, -2
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB24_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s4
+; GFX9_DPP-NEXT: ds_min_rtn_i32 v0, v0, v3
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: .LBB24_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_min_i32_e32 v0, s4, v0
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
;
-; GFX7LESS-LABEL: umax_i32_varying:
+; GFX1064_DPP-LABEL: min_i32_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v1, -2
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v3, -2
+; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB24_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7
+; GFX1064_DPP-NEXT: s_mov_b32 s3, s7
+; GFX1064_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB24_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064_DPP-NEXT: v_min_i32_e32 v0, s3, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: min_i32_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v1, -2
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v3, -2
+; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v3, s3, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s3, s4
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB24_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s3
+; GFX1032_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB24_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032_DPP-NEXT: v_min_i32_e32 v0, s3, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: min_i32_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v1, -2
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v3, -2
+; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB24_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s7
+; GFX1164_DPP-NEXT: s_mov_b32 s3, s7
+; GFX1164_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB24_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_min_i32_e32 v0, s3, v0
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: min_i32_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v1, -2
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v3, -2
+; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_writelane_b32 v3, s3, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132_DPP-NEXT: s_mov_b32 s3, s4
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB24_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s3
+; GFX1132_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB24_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_min_i32_e32 v0, s3, v0
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %old = atomicrmw min ptr addrspace(3) @local_var32, i32 %lane acq_rel
+ store i32 %old, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
+; GFX7LESS-LABEL: min_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB25_2
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
+; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: .LBB25_2:
+; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
+; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2
+; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
+; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
+; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
+; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
-; GFX8-LABEL: umax_i32_varying:
+; GFX8-LABEL: min_i64_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: s_mov_b32 s4, 0
-; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: .LBB21_1: ; %ComputeLoop
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX8-NEXT: s_mov_b32 m0, s5
-; GFX8-NEXT: v_readlane_b32 s8, v0, s5
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: v_writelane_b32 v1, s4, m0
-; GFX8-NEXT: s_max_u32 s4, s4, s8
-; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8-NEXT: s_cbranch_scc1 .LBB21_1
-; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT: ; implicit-def: $vgpr0
+; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX8-NEXT: s_cbranch_execz .LBB21_4
-; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: s_cbranch_execz .LBB25_2
+; GFX8-NEXT: ; %bb.1:
+; GFX8-NEXT: v_mov_b32_e32 v0, 5
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v2
+; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: .LBB21_4:
+; GFX8-NEXT: .LBB25_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: v_bfrev_b32_e32 v0, -2
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_max_u32_e32 v0, s4, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
-; GFX9-LABEL: umax_i32_varying:
+; GFX9-LABEL: min_i64_constant:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: s_mov_b32 s4, 0
-; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: .LBB21_1: ; %ComputeLoop
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: s_mov_b32 m0, s5
-; GFX9-NEXT: v_readlane_b32 s8, v0, s5
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: v_writelane_b32 v1, s4, m0
-; GFX9-NEXT: s_max_u32 s4, s4, s8
-; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9-NEXT: s_cbranch_scc1 .LBB21_1
-; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: ; implicit-def: $vgpr0
+; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execz .LBB21_4
-; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v2
+; GFX9-NEXT: s_cbranch_execz .LBB25_2
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: v_mov_b32_e32 v0, 5
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: .LBB21_4:
+; GFX9-NEXT: .LBB25_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: v_bfrev_b32_e32 v0, -2
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_max_u32_e32 v0, s4, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: umax_i32_varying:
+; GFX1064-LABEL: min_i64_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: s_mov_b32 s4, 0
-; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: .LBB21_1: ; %ComputeLoop
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
-; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064-NEXT: s_max_u32 s4, s4, s8
-; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064-NEXT: s_cbranch_scc1 .LBB21_1
-; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: ; implicit-def: $vgpr0
+; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execz .LBB21_4
-; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: v_mov_b32_e32 v2, s4
-; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v2
+; GFX1064-NEXT: s_cbranch_execz .LBB25_2
+; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: v_mov_b32_e32 v0, 5
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB21_4:
+; GFX1064-NEXT: .LBB25_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
+; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_max_u32_e32 v0, s2, v1
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: umax_i32_varying:
+; GFX1032-LABEL: min_i64_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: .LBB21_1: ; %ComputeLoop
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
-; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
-; GFX1032-NEXT: s_max_u32 s2, s2, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1032-NEXT: s_cbranch_scc1 .LBB21_1
-; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3
-; GFX1032-NEXT: s_cbranch_execz .LBB21_4
-; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s2
-; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v2
+; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB25_2
+; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: v_mov_b32_e32 v0, 5
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB21_4:
+; GFX1032-NEXT: .LBB25_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
+; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_max_u32_e32 v0, s2, v1
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: umax_i32_varying:
+; GFX1164-LABEL: min_i64_constant:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_mov_b32 s4, 0
-; GFX1164-NEXT: ; implicit-def: $vgpr1
-; GFX1164-NEXT: .LBB21_1: ; %ComputeLoop
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
-; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164-NEXT: s_max_u32 s4, s4, s8
-; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164-NEXT: s_cbranch_scc1 .LBB21_1
-; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT: ; implicit-def: $vgpr0
+; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execz .LBB21_4
-; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s4
-; GFX1164-NEXT: ds_max_rtn_u32 v0, v0, v2
+; GFX1164-NEXT: s_cbranch_execz .LBB25_2
+; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: v_mov_b32_e32 v0, 5
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB21_4:
+; GFX1164-NEXT: .LBB25_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
+; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_max_u32_e32 v0, s2, v1
+; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
+; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
+; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: umax_i32_varying:
+; GFX1132-LABEL: min_i64_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: ; implicit-def: $vgpr1
-; GFX1132-NEXT: .LBB21_1: ; %ComputeLoop
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
-; GFX1132-NEXT: s_max_u32 s2, s2, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1132-NEXT: s_cbranch_scc1 .LBB21_1
-; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3
-; GFX1132-NEXT: s_cbranch_execz .LBB21_4
-; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-NEXT: v_mov_b32_e32 v2, s2
-; GFX1132-NEXT: ds_max_rtn_u32 v0, v0, v2
+; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1132-NEXT: s_cbranch_execz .LBB25_2
+; GFX1132-NEXT: ; %bb.1:
+; GFX1132-NEXT: v_mov_b32_e32 v0, 5
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX1132-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB21_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132-NEXT: .LBB25_2:
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
+; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_max_u32_e32 v0, s2, v1
+; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
+; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
+entry:
+ %old = atomicrmw min ptr addrspace(3) @local_var64, i64 5 acq_rel
+ store i64 %old, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
+; GFX7LESS_ITERATIVE-LABEL: min_i64_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s3, -2
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX7LESS_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9
+; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB26_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4]
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: .LBB26_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2]
+; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
+; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
+;
+; GFX8_ITERATIVE-LABEL: min_i64_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_ITERATIVE-NEXT: s_brev_b32 s3, -2
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX8_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX8_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[4:5]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
+; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9
+; GFX8_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX8_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4]
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: .LBB26_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
+; GFX8_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2]
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
+; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
+;
+; GFX9_ITERATIVE-LABEL: min_i64_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_ITERATIVE-NEXT: s_brev_b32 s3, -2
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX9_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX9_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[4:5]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
+; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9
+; GFX9_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX9_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4]
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: .LBB26_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
+; GFX9_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2]
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
+; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1064_ITERATIVE-LABEL: min_i64_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064_ITERATIVE-NEXT: s_brev_b32 s3, -2
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1064_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10
+; GFX1064_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7
+; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1064_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4]
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB26_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1064_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
+; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc
+; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1032_ITERATIVE-LABEL: min_i64_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_brev_b32 s3, -2
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1032_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1032_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s8, s[2:3], s[6:7]
+; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7
+; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1032_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4]
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB26_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1032_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[1:2]
+; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo
+; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1164_ITERATIVE-LABEL: min_i64_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164_ITERATIVE-NEXT: s_brev_b32 s3, -2
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1164_ITERATIVE-NEXT: .p2align 6
+; GFX1164_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10
+; GFX1164_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7
+; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1164_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4]
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB26_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
+; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc
+; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1132_ITERATIVE-LABEL: min_i64_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_brev_b32 s3, -2
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1132_ITERATIVE-NEXT: .p2align 6
+; GFX1132_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1132_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s8, s[2:3], s[6:7]
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7
+; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2
+; GFX1132_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4]
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB26_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[1:2]
+; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo
+; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
+;
+; GFX7LESS_DPP-LABEL: min_i64_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: ds_min_rtn_i64 v[0:1], v1, v[0:1]
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
+;
+; GFX8_DPP-LABEL: min_i64_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: s_brev_b32 s3, -2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s3
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s7, v2, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s6, v1, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB26_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s6
+; GFX8_DPP-NEXT: s_mov_b32 m0, -1
+; GFX8_DPP-NEXT: ds_min_rtn_i64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: .LBB26_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: min_i64_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: s_brev_b32 s3, -2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s3
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s7, v2, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s6, v1, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB26_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s6
+; GFX9_DPP-NEXT: ds_min_rtn_i64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: .LBB26_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: min_i64_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1064_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1064_DPP-NEXT: s_brev_b32 s3, -2
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s3
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s3
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1064_DPP-NEXT: v_readlane_b32 s3, v4, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s3
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s6
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s3, v4, 15
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 63
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s3, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s3, v3, 31
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s6, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s10, v4, 47
+; GFX1064_DPP-NEXT: v_readlane_b32 s11, v3, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 32
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s3, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7]
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s10, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s11, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB26_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s5
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s4
+; GFX1064_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB26_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v12
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[9:10]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: min_i64_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1032_DPP-NEXT: s_brev_b32 s3, -2
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s3
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, s2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s3
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s3, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1032_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s7, v3, 15
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s3
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s3, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s3
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB26_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s5
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s4
+; GFX1032_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB26_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v12
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc_lo
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: min_i64_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1164_DPP-NEXT: s_brev_b32 s3, -2
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s3
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s2
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s3
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1164_DPP-NEXT: v_readlane_b32 s3, v4, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 31
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s3
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s6
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s3, v4, 15
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 63
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s3, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s3, v3, 31
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s10, v4, 47
+; GFX1164_DPP-NEXT: v_readlane_b32 s11, v3, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 32
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s3, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7]
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s10, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s11, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[8:9]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB26_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s5
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s4
+; GFX1164_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB26_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v12
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[9:10]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: min_i64_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v10, 0
+; GFX1132_DPP-NEXT: s_brev_b32 s3, -2
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, s2
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s3
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s3, -1
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v3
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1132_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s6, v4, 15
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s7, v3, 15
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s3, -1
+; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s3
+; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB26_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v11, s4
+; GFX1132_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB26_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v12
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc_lo
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %lane_ext = zext i32 %lane to i64
+ %old = atomicrmw min ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel
+ store i64 %old, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
+; GFX7LESS_ITERATIVE-LABEL: umax_i32_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX7LESS_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_max_u32 s4, s4, s8
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB27_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: .LBB27_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX7LESS_ITERATIVE-NEXT: v_max_u32_e32 v0, s4, v1
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
+;
+; GFX8_ITERATIVE-LABEL: umax_i32_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX8_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX8_ITERATIVE-NEXT: s_max_u32 s4, s4, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX8_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: .LBB27_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_max_u32_e32 v0, s4, v1
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
+;
+; GFX9_ITERATIVE-LABEL: umax_i32_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX9_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX9_ITERATIVE-NEXT: s_max_u32 s4, s4, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX9_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: .LBB27_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_max_u32_e32 v0, s4, v1
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1064_ITERATIVE-LABEL: umax_i32_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1064_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_max_u32 s4, s4, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB27_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v1
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1032_ITERATIVE-LABEL: umax_i32_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1032_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s3, s3, s6
+; GFX1032_ITERATIVE-NEXT: s_max_u32 s2, s2, s5
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2
+; GFX1032_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB27_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v1
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1164_ITERATIVE-LABEL: umax_i32_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1164_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_max_u32 s4, s4, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB27_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v1
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1132_ITERATIVE-LABEL: umax_i32_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, 0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1132_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s3, s3, s6
+; GFX1132_ITERATIVE-NEXT: s_max_u32 s2, s2, s5
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2
+; GFX1132_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB27_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v1
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
+;
+; GFX7LESS_DPP-LABEL: umax_i32_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: ds_max_rtn_u32 v0, v1, v0
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
+;
+; GFX8_DPP-LABEL: umax_i32_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB27_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX8_DPP-NEXT: s_mov_b32 m0, -1
+; GFX8_DPP-NEXT: ds_max_rtn_u32 v0, v3, v0
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: .LBB27_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_max_u32_e32 v0, s4, v0
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: umax_i32_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB27_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX9_DPP-NEXT: ds_max_rtn_u32 v0, v3, v0
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: .LBB27_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_max_u32_e32 v0, s4, v0
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: umax_i32_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB27_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s7
+; GFX1064_DPP-NEXT: s_mov_b32 s3, s7
+; GFX1064_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB27_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064_DPP-NEXT: v_max_u32_e32 v0, s3, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: umax_i32_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v3, s3, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s3, s4
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB27_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s3
+; GFX1032_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB27_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032_DPP-NEXT: v_max_u32_e32 v0, s3, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: umax_i32_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB27_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s7
+; GFX1164_DPP-NEXT: s_mov_b32 s3, s7
+; GFX1164_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB27_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_max_u32_e32 v0, s3, v0
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: umax_i32_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_writelane_b32 v3, s3, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132_DPP-NEXT: s_mov_b32 s3, s4
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB27_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s3
+; GFX1132_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB27_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_max_u32_e32 v0, s3, v0
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%old = atomicrmw umax ptr addrspace(3) @local_var32, i32 %lane acq_rel
@@ -5645,8 +15384,6 @@ entry:
}
define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
-;
-;
; GFX7LESS-LABEL: umax_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
@@ -5654,7 +15391,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX7LESS-NEXT: s_cbranch_execz .LBB22_2
+; GFX7LESS-NEXT: s_cbranch_execz .LBB28_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
@@ -5662,7 +15399,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: .LBB22_2:
+; GFX7LESS-NEXT: .LBB28_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
@@ -5687,7 +15424,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_cbranch_execz .LBB22_2
+; GFX8-NEXT: s_cbranch_execz .LBB28_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v1, 0
@@ -5695,7 +15432,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: .LBB22_2:
+; GFX8-NEXT: .LBB28_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
@@ -5720,14 +15457,14 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB22_2
+; GFX9-NEXT: s_cbranch_execz .LBB28_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: .LBB22_2:
+; GFX9-NEXT: .LBB28_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
@@ -5752,7 +15489,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB22_2
+; GFX1064-NEXT: s_cbranch_execz .LBB28_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
@@ -5760,7 +15497,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB22_2:
+; GFX1064-NEXT: .LBB28_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -5783,7 +15520,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB22_2
+; GFX1032-NEXT: s_cbranch_execz .LBB28_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
@@ -5791,7 +15528,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB22_2:
+; GFX1032-NEXT: .LBB28_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -5816,7 +15553,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1164-NEXT: s_cbranch_execz .LBB22_2
+; GFX1164-NEXT: s_cbranch_execz .LBB28_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: v_mov_b32_e32 v0, 5
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
@@ -5824,7 +15561,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB22_2:
+; GFX1164-NEXT: .LBB28_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
@@ -5850,14 +15587,14 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo
-; GFX1132-NEXT: s_cbranch_execz .LBB22_2
+; GFX1132-NEXT: s_cbranch_execz .LBB28_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: v_mov_b32_e32 v0, 5
; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX1132-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB22_2:
+; GFX1132-NEXT: .LBB28_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
@@ -5881,273 +15618,1665 @@ entry:
ret void
}
+define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
+; GFX7LESS_ITERATIVE-LABEL: umax_i64_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX7LESS_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9
+; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB29_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4]
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: .LBB29_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2]
+; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
+; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
+;
+; GFX8_ITERATIVE-LABEL: umax_i64_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX8_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX8_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[4:5]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
+; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9
+; GFX8_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX8_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4]
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: .LBB29_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
+; GFX8_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2]
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
+; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
+;
+; GFX9_ITERATIVE-LABEL: umax_i64_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX9_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX9_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[4:5]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
+; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9
+; GFX9_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX9_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4]
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: .LBB29_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
+; GFX9_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2]
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
+; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1064_ITERATIVE-LABEL: umax_i64_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1064_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10
+; GFX1064_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s[8:9], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7
+; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1064_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB29_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1064_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[1:2]
+; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc
+; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1032_ITERATIVE-LABEL: umax_i64_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1032_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1032_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s8, s[2:3], s[6:7]
+; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7
+; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1032_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB29_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1032_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[1:2]
+; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo
+; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1164_ITERATIVE-LABEL: umax_i64_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1164_ITERATIVE-NEXT: .p2align 6
+; GFX1164_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s[8:9], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7
+; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1164_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB29_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[1:2]
+; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc
+; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1132_ITERATIVE-LABEL: umax_i64_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1132_ITERATIVE-NEXT: .p2align 6
+; GFX1132_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s8, s[2:3], s[6:7]
+; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7
+; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2
+; GFX1132_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB29_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[1:2]
+; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo
+; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
+;
+; GFX7LESS_DPP-LABEL: umax_i64_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: ds_max_rtn_u64 v[0:1], v1, v[0:1]
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
+;
+; GFX8_DPP-LABEL: umax_i64_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB29_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT: s_mov_b32 m0, -1
+; GFX8_DPP-NEXT: ds_max_rtn_u64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: .LBB29_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: umax_i64_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB29_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX9_DPP-NEXT: ds_max_rtn_u64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: .LBB29_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: umax_i64_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 31
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s7, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB29_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s5
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s4
+; GFX1064_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB29_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v12
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[9:10]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: umax_i64_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX1032_DPP-NEXT: v_readlane_b32 s3, v4, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1032_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v2, s3, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB29_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s5
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s4
+; GFX1032_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB29_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v12
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc_lo
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: umax_i64_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s4, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s8, v3, 31
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s5, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s6, 32
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB29_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s5
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s4
+; GFX1164_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB29_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v12
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[9:10]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: umax_i64_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v10, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v3
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_readlane_b32 s3, v4, 15
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1132_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_writelane_b32 v2, s3, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB29_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v11, s4
+; GFX1132_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB29_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v12
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc_lo
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %lane_ext = zext i32 %lane to i64
+ %old = atomicrmw umax ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel
+ store i64 %old, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
+; GFX7LESS_ITERATIVE-LABEL: umin_i32_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, -1
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX7LESS_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_min_u32 s4, s4, s8
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB30_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: .LBB30_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX7LESS_ITERATIVE-NEXT: v_min_u32_e32 v0, s4, v1
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
;
+; GFX8_ITERATIVE-LABEL: umin_i32_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, -1
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX8_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX8_ITERATIVE-NEXT: s_min_u32 s4, s4, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX8_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: .LBB30_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_min_u32_e32 v0, s4, v1
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
;
-; GFX7LESS-LABEL: umin_i32_varying:
-; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX7LESS-NEXT: s_endpgm
+; GFX9_ITERATIVE-LABEL: umin_i32_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, -1
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX9_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX9_ITERATIVE-NEXT: s_min_u32 s4, s4, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX9_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: .LBB30_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_min_u32_e32 v0, s4, v1
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
;
-; GFX8-LABEL: umin_i32_varying:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: s_mov_b32 s4, -1
-; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: .LBB23_1: ; %ComputeLoop
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX8-NEXT: s_mov_b32 m0, s5
-; GFX8-NEXT: v_readlane_b32 s8, v0, s5
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: v_writelane_b32 v1, s4, m0
-; GFX8-NEXT: s_min_u32 s4, s4, s8
-; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8-NEXT: s_cbranch_scc1 .LBB23_1
-; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX8-NEXT: s_cbranch_execz .LBB23_4
-; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v2
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: .LBB23_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_min_u32_e32 v0, s4, v1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
+; GFX1064_ITERATIVE-LABEL: umin_i32_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, -1
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1064_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_min_u32 s4, s4, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB30_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v1
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
;
-; GFX9-LABEL: umin_i32_varying:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: s_mov_b32 s4, -1
-; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: .LBB23_1: ; %ComputeLoop
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: s_mov_b32 m0, s5
-; GFX9-NEXT: v_readlane_b32 s8, v0, s5
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: v_writelane_b32 v1, s4, m0
-; GFX9-NEXT: s_min_u32 s4, s4, s8
-; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9-NEXT: s_cbranch_scc1 .LBB23_1
-; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execz .LBB23_4
-; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v2
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: .LBB23_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_min_u32_e32 v0, s4, v1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX9-NEXT: s_endpgm
+; GFX1032_ITERATIVE-LABEL: umin_i32_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1032_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s3, s3, s6
+; GFX1032_ITERATIVE-NEXT: s_min_u32 s2, s2, s5
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2
+; GFX1032_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB30_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v1
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
;
-; GFX1064-LABEL: umin_i32_varying:
-; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: s_mov_b32 s4, -1
-; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: .LBB23_1: ; %ComputeLoop
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
-; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064-NEXT: s_min_u32 s4, s4, s8
-; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064-NEXT: s_cbranch_scc1 .LBB23_1
-; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execz .LBB23_4
-; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: v_mov_b32_e32 v2, s4
-; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v2
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB23_4:
-; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_min_u32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX1064-NEXT: s_endpgm
+; GFX1164_ITERATIVE-LABEL: umin_i32_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, -1
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1164_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_min_u32 s4, s4, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB30_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v1
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
;
-; GFX1032-LABEL: umin_i32_varying:
-; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: .LBB23_1: ; %ComputeLoop
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
-; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
-; GFX1032-NEXT: s_min_u32 s2, s2, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1032-NEXT: s_cbranch_scc1 .LBB23_1
-; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3
-; GFX1032-NEXT: s_cbranch_execz .LBB23_4
-; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s2
-; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v2
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB23_4:
-; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_min_u32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX1032-NEXT: s_endpgm
+; GFX1132_ITERATIVE-LABEL: umin_i32_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1
+; GFX1132_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s3, s3, s6
+; GFX1132_ITERATIVE-NEXT: s_min_u32 s2, s2, s5
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0
+; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2
+; GFX1132_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB30_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v1
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
;
-; GFX1164-LABEL: umin_i32_varying:
-; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_mov_b32 s4, -1
-; GFX1164-NEXT: ; implicit-def: $vgpr1
-; GFX1164-NEXT: .LBB23_1: ; %ComputeLoop
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
-; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164-NEXT: s_min_u32 s4, s4, s8
-; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164-NEXT: s_cbranch_scc1 .LBB23_1
-; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execz .LBB23_4
-; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s4
-; GFX1164-NEXT: ds_min_rtn_u32 v0, v0, v2
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB23_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_min_u32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164-NEXT: s_nop 0
-; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1164-NEXT: s_endpgm
+; GFX7LESS_DPP-LABEL: umin_i32_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: ds_min_rtn_u32 v0, v1, v0
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
;
-; GFX1132-LABEL: umin_i32_varying:
-; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: ; implicit-def: $vgpr1
-; GFX1132-NEXT: .LBB23_1: ; %ComputeLoop
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
-; GFX1132-NEXT: s_min_u32 s2, s2, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1132-NEXT: s_cbranch_scc1 .LBB23_1
-; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3
-; GFX1132-NEXT: s_cbranch_execz .LBB23_4
-; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-NEXT: v_mov_b32_e32 v2, s2
-; GFX1132-NEXT: ds_min_rtn_u32 v0, v0, v2
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB23_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_min_u32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132-NEXT: s_nop 0
-; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX1132-NEXT: s_endpgm
+; GFX8_DPP-LABEL: umin_i32_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: s_nop 1
+; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB30_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s4
+; GFX8_DPP-NEXT: s_mov_b32 m0, -1
+; GFX8_DPP-NEXT: ds_min_rtn_u32 v0, v0, v3
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: .LBB30_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_min_u32_e32 v0, s4, v0
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: umin_i32_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: s_nop 1
+; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB30_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s4
+; GFX9_DPP-NEXT: ds_min_rtn_u32 v0, v0, v3
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: .LBB30_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_min_u32_e32 v0, s4, v0
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: umin_i32_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB30_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7
+; GFX1064_DPP-NEXT: s_mov_b32 s3, s7
+; GFX1064_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB30_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064_DPP-NEXT: v_min_u32_e32 v0, s3, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: umin_i32_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v3, s3, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s3, s4
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB30_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s3
+; GFX1032_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB30_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032_DPP-NEXT: v_min_u32_e32 v0, s3, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: umin_i32_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB30_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s7
+; GFX1164_DPP-NEXT: s_mov_b32 s3, s7
+; GFX1164_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB30_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_min_u32_e32 v0, s3, v0
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: umin_i32_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15
+; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_writelane_b32 v3, s3, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132_DPP-NEXT: s_mov_b32 s3, s4
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB30_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s3
+; GFX1132_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB30_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_min_u32_e32 v0, s3, v0
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%old = atomicrmw umin ptr addrspace(3) @local_var32, i32 %lane acq_rel
@@ -6156,8 +17285,6 @@ entry:
}
define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
-;
-;
; GFX7LESS-LABEL: umin_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
@@ -6165,7 +17292,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX7LESS-NEXT: s_cbranch_execz .LBB24_2
+; GFX7LESS-NEXT: s_cbranch_execz .LBB31_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
@@ -6173,7 +17300,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: .LBB24_2:
+; GFX7LESS-NEXT: .LBB31_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
@@ -6198,7 +17325,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_cbranch_execz .LBB24_2
+; GFX8-NEXT: s_cbranch_execz .LBB31_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v1, 0
@@ -6206,7 +17333,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: .LBB24_2:
+; GFX8-NEXT: .LBB31_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
@@ -6231,14 +17358,14 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB24_2
+; GFX9-NEXT: s_cbranch_execz .LBB31_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: .LBB24_2:
+; GFX9-NEXT: .LBB31_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
@@ -6263,7 +17390,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB24_2
+; GFX1064-NEXT: s_cbranch_execz .LBB31_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
@@ -6271,7 +17398,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: .LBB24_2:
+; GFX1064-NEXT: .LBB31_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -6294,7 +17421,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB24_2
+; GFX1032-NEXT: s_cbranch_execz .LBB31_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
@@ -6302,7 +17429,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: .LBB24_2:
+; GFX1032-NEXT: .LBB31_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -6327,7 +17454,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1164-NEXT: s_cbranch_execz .LBB24_2
+; GFX1164-NEXT: s_cbranch_execz .LBB31_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: v_mov_b32_e32 v0, 5
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
@@ -6335,7 +17462,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
-; GFX1164-NEXT: .LBB24_2:
+; GFX1164-NEXT: .LBB31_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
@@ -6361,14 +17488,14 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo
-; GFX1132-NEXT: s_cbranch_execz .LBB24_2
+; GFX1132-NEXT: s_cbranch_execz .LBB31_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: v_mov_b32_e32 v0, 5
; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX1132-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
-; GFX1132-NEXT: .LBB24_2:
+; GFX1132-NEXT: .LBB31_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
@@ -6391,3 +17518,1033 @@ entry:
store i64 %old, ptr addrspace(1) %out
ret void
}
+
+define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
+; GFX7LESS_ITERATIVE-LABEL: umin_i64_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX7LESS_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9
+; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB32_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4]
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: .LBB32_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2]
+; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
+; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
+;
+; GFX8_ITERATIVE-LABEL: umin_i64_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX8_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX8_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[4:5]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
+; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9
+; GFX8_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
+; GFX8_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4]
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: .LBB32_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
+; GFX8_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2]
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
+; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
+;
+; GFX9_ITERATIVE-LABEL: umin_i64_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX9_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX9_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[4:5]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
+; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9
+; GFX9_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX9_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4]
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: .LBB32_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
+; GFX9_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2]
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
+; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1064_ITERATIVE-LABEL: umin_i64_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1064_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10
+; GFX1064_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s[8:9], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7
+; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1064_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB32_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1064_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2]
+; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc
+; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1032_ITERATIVE-LABEL: umin_i64_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1032_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1032_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s8, s[2:3], s[6:7]
+; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7
+; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1032_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB32_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1032_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[1:2]
+; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo
+; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1164_ITERATIVE-LABEL: umin_i64_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1164_ITERATIVE-NEXT: .p2align 6
+; GFX1164_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s[8:9], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7
+; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2
+; GFX1164_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB32_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2]
+; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc
+; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1132_ITERATIVE-LABEL: umin_i64_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1132_ITERATIVE-NEXT: .p2align 6
+; GFX1132_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s8, s[2:3], s[6:7]
+; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7
+; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2
+; GFX1132_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB32_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[1:2]
+; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo
+; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
+;
+; GFX7LESS_DPP-LABEL: umin_i64_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: ds_min_rtn_u64 v[0:1], v1, v[0:1]
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
+;
+; GFX8_DPP-LABEL: umin_i64_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB32_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT: s_mov_b32 m0, -1
+; GFX8_DPP-NEXT: ds_min_rtn_u64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: .LBB32_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: umin_i64_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB32_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX9_DPP-NEXT: ds_min_rtn_u64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: .LBB32_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
+;
+; GFX1064_DPP-LABEL: umin_i64_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 31
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s7, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB32_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s5
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s4
+; GFX1064_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB32_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v12
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[9:10]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: umin_i64_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX1032_DPP-NEXT: v_readlane_b32 s3, v4, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1032_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v2, s3, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB32_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s5
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s4
+; GFX1032_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB32_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v12
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc_lo
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: umin_i64_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, -1
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s4, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s8, v3, 31
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s5, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s6, 32
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB32_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s5
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s4
+; GFX1164_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB32_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v12
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[9:10]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: umin_i64_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v10, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, -1
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v3
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_readlane_b32 s3, v4, 15
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1132_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_writelane_b32 v2, s3, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB32_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v11, s4
+; GFX1132_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB32_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v12
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v11
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc_lo
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %lane_ext = zext i32 %lane to i64
+ %old = atomicrmw umin ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel
+ store i64 %old, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index ca94d68f01917..6be4b893c14a4 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -570,13 +570,44 @@ entry:
define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
; GFX6-LABEL: add_i32_varying_vdata:
; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
-; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b64 s[2:3], exec
+; GFX6-NEXT: s_mov_b32 s4, 0
+; GFX6-NEXT: ; implicit-def: $vgpr1
+; GFX6-NEXT: .LBB2_1: ; %ComputeLoop
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX6-NEXT: s_mov_b32 m0, s5
+; GFX6-NEXT: v_readlane_b32 s8, v0, s5
+; GFX6-NEXT: v_writelane_b32 v1, s4, m0
+; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
+; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX6-NEXT: s_add_i32 s4, s4, s8
+; GFX6-NEXT: s_cbranch_vccnz .LBB2_1
+; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX6-NEXT: ; implicit-def: $vgpr0
+; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX6-NEXT: s_cbranch_execz .LBB2_4
+; GFX6-NEXT: ; %bb.3:
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
+; GFX6-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX6-NEXT: .LBB2_4:
+; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_readfirstlane_b32 s4, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
@@ -1575,13 +1606,44 @@ entry:
define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
; GFX6-LABEL: sub_i32_varying_vdata:
; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
-; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b64 s[2:3], exec
+; GFX6-NEXT: s_mov_b32 s4, 0
+; GFX6-NEXT: ; implicit-def: $vgpr1
+; GFX6-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX6-NEXT: s_mov_b32 m0, s5
+; GFX6-NEXT: v_readlane_b32 s8, v0, s5
+; GFX6-NEXT: v_writelane_b32 v1, s4, m0
+; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
+; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX6-NEXT: s_add_i32 s4, s4, s8
+; GFX6-NEXT: s_cbranch_vccnz .LBB6_1
+; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX6-NEXT: ; implicit-def: $vgpr0
+; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX6-NEXT: s_cbranch_execz .LBB6_4
+; GFX6-NEXT: ; %bb.3:
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc
+; GFX6-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX6-NEXT: .LBB6_4:
+; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_readfirstlane_b32 s4, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 7e15c07f95269..419d8bc5cc747 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -586,14 +586,45 @@ entry:
define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
; GFX6-LABEL: add_i32_varying_vdata:
; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
-; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: s_mov_b64 s[2:3], exec
+; GFX6-NEXT: s_mov_b32 s4, 0
+; GFX6-NEXT: ; implicit-def: $vgpr1
+; GFX6-NEXT: .LBB2_1: ; %ComputeLoop
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX6-NEXT: s_mov_b32 m0, s5
+; GFX6-NEXT: v_readlane_b32 s8, v0, s5
+; GFX6-NEXT: v_writelane_b32 v1, s4, m0
+; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
+; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX6-NEXT: s_add_i32 s4, s4, s8
+; GFX6-NEXT: s_cbranch_vccnz .LBB2_1
+; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX6-NEXT: ; implicit-def: $vgpr0
+; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX6-NEXT: s_cbranch_execz .LBB2_4
+; GFX6-NEXT: ; %bb.3:
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 idxen glc
+; GFX6-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
+; GFX6-NEXT: .LBB2_4:
+; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_readfirstlane_b32 s4, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
@@ -1749,14 +1780,45 @@ entry:
define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
; GFX6-LABEL: sub_i32_varying_vdata:
; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
-; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: s_mov_b64 s[2:3], exec
+; GFX6-NEXT: s_mov_b32 s4, 0
+; GFX6-NEXT: ; implicit-def: $vgpr1
+; GFX6-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX6-NEXT: s_mov_b32 m0, s5
+; GFX6-NEXT: v_readlane_b32 s8, v0, s5
+; GFX6-NEXT: v_writelane_b32 v1, s4, m0
+; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
+; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX6-NEXT: s_add_i32 s4, s4, s8
+; GFX6-NEXT: s_cbranch_vccnz .LBB7_1
+; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX6-NEXT: ; implicit-def: $vgpr0
+; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX6-NEXT: s_cbranch_execz .LBB7_4
+; GFX6-NEXT: ; %bb.3:
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: buffer_atomic_sub v0, v1, s[4:7], 0 idxen glc
+; GFX6-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc
+; GFX6-NEXT: .LBB7_4:
+; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_readfirstlane_b32 s4, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index b6990c8b842fd..c1f21e2ed1477 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -5,7 +5,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
@@ -191,6 +191,42 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2
+; GFX7LESS-DPP-NEXT: .LBB0_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
@@ -337,19 +373,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -361,27 +395,51 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
+; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB1_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB1_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0
+; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4
+; GFX7LESS-NEXT: .LBB1_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
@@ -676,6 +734,56 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: .LBB1_4:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -1305,6 +1413,52 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1132-NEXT: .LBB2_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s10, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s11, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2
+; GFX7LESS-DPP-NEXT: .LBB2_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
@@ -1517,19 +1671,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -1541,27 +1693,51 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
+; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB3_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB3_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0
+; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4
+; GFX7LESS-NEXT: .LBB3_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp:
@@ -1882,6 +2058,56 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1132-NEXT: .LBB3_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -2537,6 +2763,52 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: .LBB4_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s10, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s11, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2
+; GFX7LESS-DPP-NEXT: .LBB4_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
@@ -2749,19 +3021,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -2773,27 +3043,51 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
+; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB5_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB5_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0
+; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4
+; GFX7LESS-NEXT: .LBB5_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
@@ -3088,6 +3382,56 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: .LBB5_4:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -3471,19 +3815,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -3495,27 +3837,51 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX7LESS-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
+; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB6_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB6_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: .LBB6_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0
+; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_4
+; GFX7LESS-NEXT: .LBB6_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -3810,6 +4176,56 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: .LBB6_4:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -4439,6 +4855,52 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1132-NEXT: .LBB7_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s10, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s11, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX7LESS-DPP-NEXT: .LBB7_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
@@ -4650,19 +5112,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -4674,27 +5134,51 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX7LESS-NEXT: .LBB8_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
+; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB8_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB8_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: .LBB8_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0
+; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_4
+; GFX7LESS-NEXT: .LBB8_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp:
@@ -5015,6 +5499,56 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1132-NEXT: .LBB8_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -5634,6 +6168,46 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1132-NEXT: .LBB9_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX7LESS-DPP-NEXT: .LBB9_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
@@ -5815,19 +6389,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -5839,30 +6411,56 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX7LESS-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
+; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
+; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB10_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB10_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: .LBB10_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5
-; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_4
+; GFX7LESS-NEXT: .LBB10_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
@@ -6196,6 +6794,59 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: .LBB10_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -6933,6 +7584,55 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1132-NEXT: .LBB11_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s3
+; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX7LESS-DPP-NEXT: .LBB11_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
@@ -7148,19 +7848,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -7172,30 +7870,56 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX7LESS-NEXT: .LBB12_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
+; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
+; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB12_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB12_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: .LBB12_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5
-; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_4
+; GFX7LESS-NEXT: .LBB12_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
@@ -7529,6 +8253,59 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1132-NEXT: .LBB12_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -8266,6 +9043,55 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1132-NEXT: .LBB13_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s3
+; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2
+; GFX7LESS-DPP-NEXT: .LBB13_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
@@ -8481,19 +9307,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -8505,30 +9329,56 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX7LESS-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
+; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
+; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB14_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB14_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: .LBB14_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5
-; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_4
+; GFX7LESS-NEXT: .LBB14_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
@@ -8862,6 +9712,59 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: .LBB14_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB14_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -9345,19 +10248,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -9369,30 +10270,56 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX7LESS-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
+; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
+; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB15_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB15_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: .LBB15_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5
-; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_4
+; GFX7LESS-NEXT: .LBB15_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -9726,6 +10653,59 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: .LBB15_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB15_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -10463,6 +11443,55 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1132-NEXT: .LBB16_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s3
+; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB16_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB16_2
+; GFX7LESS-DPP-NEXT: .LBB16_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
@@ -10678,19 +11707,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -10702,30 +11729,56 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX7LESS-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
+; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
+; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB17_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB17_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: .LBB17_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5
-; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_4
+; GFX7LESS-NEXT: .LBB17_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
@@ -11059,6 +12112,59 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1132-NEXT: .LBB17_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB17_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -11740,6 +12846,42 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
; GFX1132-NEXT: .LBB18_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB18_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB18_2
+; GFX7LESS-DPP-NEXT: .LBB18_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
@@ -12110,6 +13252,42 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
; GFX1132-NEXT: .LBB19_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB19_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB19_2
+; GFX7LESS-DPP-NEXT: .LBB19_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index f512f17bbbcbf..54722638a9455 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -5,7 +5,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
@@ -145,6 +145,39 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2
+; GFX7LESS-DPP-NEXT: .LBB0_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -248,19 +281,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -272,14 +303,40 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0
-; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0xff800000
+; GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
+; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB1_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB1_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
@@ -287,14 +344,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4
+; GFX7LESS-NEXT: .LBB1_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
@@ -577,6 +634,58 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1132-NEXT: .LBB1_4:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0
+; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -1153,6 +1262,39 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1132-NEXT: .LBB2_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2
+; GFX7LESS-DPP-NEXT: .LBB2_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1310,19 +1452,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -1334,14 +1474,40 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0
-; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0xff800000
+; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
+; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB3_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB3_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
@@ -1349,14 +1515,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4
+; GFX7LESS-NEXT: .LBB3_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
@@ -1698,6 +1864,58 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1132-NEXT: .LBB3_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0
+; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -2335,6 +2553,39 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1132-NEXT: .LBB4_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2
+; GFX7LESS-DPP-NEXT: .LBB4_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2491,19 +2742,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -2515,14 +2764,40 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0
-; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0xff800000
+; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
+; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB5_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB5_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
@@ -2530,14 +2805,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4
+; GFX7LESS-NEXT: .LBB5_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
@@ -2879,6 +3154,58 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1132-NEXT: .LBB5_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0
+; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -3499,6 +3826,43 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1132-NEXT: .LBB6_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_2
+; GFX7LESS-DPP-NEXT: .LBB6_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3635,19 +3999,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -3659,14 +4021,42 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX7LESS-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
+; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
+; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB7_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB7_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -3676,15 +4066,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_4
+; GFX7LESS-NEXT: .LBB7_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
@@ -4012,6 +4402,61 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-NEXT: .LBB7_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -4707,6 +5152,43 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1132-NEXT: .LBB8_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_2
+; GFX7LESS-DPP-NEXT: .LBB8_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4871,19 +5353,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -4895,14 +5375,42 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX7LESS-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX7LESS-NEXT: .LBB9_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
+; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
+; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB9_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB9_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4912,15 +5420,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_4
+; GFX7LESS-NEXT: .LBB9_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
@@ -5278,6 +5786,61 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1132-NEXT: .LBB9_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -6003,6 +6566,43 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1132-NEXT: .LBB10_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX7LESS-DPP-NEXT: .LBB10_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6167,19 +6767,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -6191,14 +6789,42 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX7LESS-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX7LESS-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
+; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
+; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB11_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB11_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -6208,15 +6834,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_4
+; GFX7LESS-NEXT: .LBB11_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
@@ -6574,6 +7200,61 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-NEXT: .LBB11_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -7287,6 +7968,39 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1132-NEXT: .LBB12_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX7LESS-DPP-NEXT: .LBB12_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7622,6 +8336,39 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1132-NEXT: .LBB13_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2
+; GFX7LESS-DPP-NEXT: .LBB13_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index c3b3079db3adc..981068f5120bf 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -5,7 +5,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
@@ -145,6 +145,39 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2
+; GFX7LESS-DPP-NEXT: .LBB0_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -248,19 +281,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -272,14 +303,40 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0
-; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
+; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB1_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB1_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
@@ -287,14 +344,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4
+; GFX7LESS-NEXT: .LBB1_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
@@ -577,6 +634,58 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1132-NEXT: .LBB1_4:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0
+; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -1153,6 +1262,39 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1132-NEXT: .LBB2_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2
+; GFX7LESS-DPP-NEXT: .LBB2_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1310,19 +1452,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -1334,14 +1474,40 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0
-; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
+; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB3_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB3_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
@@ -1349,14 +1515,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4
+; GFX7LESS-NEXT: .LBB3_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
@@ -1698,6 +1864,58 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1132-NEXT: .LBB3_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0
+; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -2335,6 +2553,39 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1132-NEXT: .LBB4_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2
+; GFX7LESS-DPP-NEXT: .LBB4_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2491,19 +2742,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -2515,14 +2764,40 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0
-; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
+; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB5_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB5_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
@@ -2530,14 +2805,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4
+; GFX7LESS-NEXT: .LBB5_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
@@ -2879,6 +3154,58 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1132-NEXT: .LBB5_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0
+; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -3499,6 +3826,43 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1132-NEXT: .LBB6_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_2
+; GFX7LESS-DPP-NEXT: .LBB6_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3635,19 +3999,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -3659,14 +4021,42 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX7LESS-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
+; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
+; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB7_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB7_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -3676,15 +4066,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_4
+; GFX7LESS-NEXT: .LBB7_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
@@ -4012,6 +4402,61 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-NEXT: .LBB7_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -4707,6 +5152,43 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1132-NEXT: .LBB8_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_2
+; GFX7LESS-DPP-NEXT: .LBB8_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4871,19 +5353,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -4895,14 +5375,42 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX7LESS-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX7LESS-NEXT: .LBB9_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
+; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
+; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB9_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB9_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4912,15 +5420,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_4
+; GFX7LESS-NEXT: .LBB9_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
@@ -5278,6 +5786,61 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1132-NEXT: .LBB9_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -6003,6 +6566,43 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1132-NEXT: .LBB10_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX7LESS-DPP-NEXT: .LBB10_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6167,19 +6767,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -6191,14 +6789,42 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX7LESS-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX7LESS-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
+; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
+; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB11_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB11_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -6208,15 +6834,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_4
+; GFX7LESS-NEXT: .LBB11_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
@@ -6574,6 +7200,61 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-NEXT: .LBB11_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -7287,6 +7968,39 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1132-NEXT: .LBB12_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX7LESS-DPP-NEXT: .LBB12_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7622,6 +8336,39 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1132-NEXT: .LBB13_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2
+; GFX7LESS-DPP-NEXT: .LBB13_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 8664fdf242036..b1363246c7d13 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -5,7 +5,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
@@ -217,6 +217,42 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: .LBB0_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2
+; GFX7LESS-DPP-NEXT: .LBB0_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
@@ -389,19 +425,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -413,27 +447,51 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
+; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB1_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB1_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0
+; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4
+; GFX7LESS-NEXT: .LBB1_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
@@ -754,6 +812,56 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: .LBB1_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -1409,6 +1517,52 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1132-NEXT: .LBB2_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s10, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s11, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2
+; GFX7LESS-DPP-NEXT: .LBB2_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
@@ -1621,19 +1775,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -1645,27 +1797,51 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
+; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB3_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB3_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0
+; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4
+; GFX7LESS-NEXT: .LBB3_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp:
@@ -1986,6 +2162,56 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1132-NEXT: .LBB3_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -2641,6 +2867,52 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: .LBB4_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s10, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s11, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2
+; GFX7LESS-DPP-NEXT: .LBB4_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
@@ -2853,19 +3125,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -2877,27 +3147,51 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
+; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB5_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB5_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0
+; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4
+; GFX7LESS-NEXT: .LBB5_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
@@ -3218,6 +3512,56 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: .LBB5_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -3627,19 +3971,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -3651,27 +3993,51 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX7LESS-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
+; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB6_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB6_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: .LBB6_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0
+; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_4
+; GFX7LESS-NEXT: .LBB6_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -3992,6 +4358,56 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: .LBB6_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -4647,6 +5063,52 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1132-NEXT: .LBB7_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s10, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s11, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX7LESS-DPP-NEXT: .LBB7_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
@@ -4858,19 +5320,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -4882,27 +5342,51 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX7LESS-NEXT: .LBB8_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
+; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB8_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB8_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: .LBB8_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0
+; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_4
+; GFX7LESS-NEXT: .LBB8_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp:
@@ -5223,6 +5707,56 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1132-NEXT: .LBB8_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -5842,6 +6376,46 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1132-NEXT: .LBB9_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX7LESS-DPP-NEXT: .LBB9_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
@@ -6023,19 +6597,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -6047,30 +6619,56 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX7LESS-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
+; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
+; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB10_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB10_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: .LBB10_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5
-; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_4
+; GFX7LESS-NEXT: .LBB10_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
@@ -6404,6 +7002,59 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: .LBB10_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -7141,6 +7792,55 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1132-NEXT: .LBB11_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s3
+; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX7LESS-DPP-NEXT: .LBB11_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
@@ -7355,19 +8055,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -7379,30 +8077,56 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX7LESS-NEXT: .LBB12_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
+; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
+; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB12_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB12_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: .LBB12_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5
-; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_4
+; GFX7LESS-NEXT: .LBB12_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
@@ -7736,6 +8460,59 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1132-NEXT: .LBB12_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -8473,6 +9250,55 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1132-NEXT: .LBB13_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s3
+; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2
+; GFX7LESS-DPP-NEXT: .LBB13_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
@@ -8688,19 +9514,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -8712,30 +9536,56 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX7LESS-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
+; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
+; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB14_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB14_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: .LBB14_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5
-; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_4
+; GFX7LESS-NEXT: .LBB14_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
@@ -9069,6 +9919,59 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: .LBB14_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB14_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -9552,19 +10455,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -9576,30 +10477,56 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX7LESS-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
+; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
+; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB15_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB15_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: .LBB15_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5
-; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_4
+; GFX7LESS-NEXT: .LBB15_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -9933,6 +10860,59 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: .LBB15_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB15_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -10669,6 +11649,55 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1132-NEXT: .LBB16_3:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s3
+; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB16_3
+; GFX7LESS-DPP-NEXT: ; %bb.1:
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB16_2
+; GFX7LESS-DPP-NEXT: .LBB16_3:
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
@@ -10884,19 +11913,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
+; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
; GFX7LESS-NEXT: s_mov_b32 s14, s8
; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s38, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -10908,30 +11935,56 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX7LESS-NEXT: s_mov_b32 s12, s6
; GFX7LESS-NEXT: s_mov_b32 s13, s7
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX7LESS-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
+; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
+; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
+; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX7LESS-NEXT: s_cbranch_vccnz .LBB17_1
+; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execz .LBB17_5
+; GFX7LESS-NEXT: ; %bb.3:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: .LBB17_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5
-; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_4
+; GFX7LESS-NEXT: .LBB17_5:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
@@ -11265,6 +12318,59 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1132-NEXT: .LBB17_5:
; GFX1132-NEXT: s_endpgm
;
+; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
+; GFX7LESS-DPP: ; %bb.0:
+; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB17_1
+; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index 99b67a278a027..08e4d11d4b311 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -7586,9 +7586,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB28_2
-; GFX7-NEXT: ; %bb.3: ; %Flow18
+; GFX7-NEXT: ; %bb.3: ; %Flow22
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: .LBB28_4: ; %Flow19
+; GFX7-NEXT: .LBB28_4: ; %Flow23
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: v_readfirstlane_b32 s8, v1
@@ -7616,32 +7616,62 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB28_6
-; GFX7-NEXT: .LBB28_7: ; %Flow17
+; GFX7-NEXT: .LBB28_7: ; %Flow21
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: ds_read_b32 v1, v2
; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
-; GFX7-NEXT: v_add_f32_e32 v0, s8, v0
+; GFX7-NEXT: s_mov_b64 s[4:5], exec
+; GFX7-NEXT: v_add_f32_e32 v2, s8, v0
+; GFX7-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX7-NEXT: ; implicit-def: $vgpr0
+; GFX7-NEXT: .LBB28_8: ; %ComputeLoop
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_ff1_i32_b64 s3, s[4:5]
+; GFX7-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX7-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX7-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0
+; GFX7-NEXT: v_readfirstlane_b32 s8, v1
+; GFX7-NEXT: v_readlane_b32 s9, v2, s3
+; GFX7-NEXT: s_mov_b32 m0, s3
+; GFX7-NEXT: v_writelane_b32 v0, s8, m0
+; GFX7-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX7-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_vccnz .LBB28_8
+; GFX7-NEXT: ; %bb.9: ; %ComputeEnd
+; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0
+; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX7-NEXT: ; implicit-def: $vgpr2
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execz .LBB28_13
+; GFX7-NEXT: ; %bb.10:
+; GFX7-NEXT: v_mov_b32_e32 v3, s2
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_read_b32 v2, v3
; GFX7-NEXT: s_mov_b64 s[2:3], 0
-; GFX7-NEXT: .LBB28_8: ; %atomicrmw.start8
+; GFX7-NEXT: .LBB28_11: ; %atomicrmw.start8
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_add_f32_e32 v1, v3, v0
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v2, v3, v1
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_add_f32_e32 v2, v4, v1
+; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX7-NEXT: s_cbranch_execnz .LBB28_8
-; GFX7-NEXT: ; %bb.9: ; %atomicrmw.end7
+; GFX7-NEXT: s_cbranch_execnz .LBB28_11
+; GFX7-NEXT: ; %bb.12: ; %Flow
; GFX7-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7-NEXT: .LBB28_13: ; %Flow19
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX7-NEXT: v_readfirstlane_b32 s4, v2
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: v_add_f32_e32 v0, s4, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX6-LABEL: local_ds_fadd:
@@ -7676,9 +7706,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_cbranch_execnz .LBB28_2
-; GFX6-NEXT: ; %bb.3: ; %Flow16
+; GFX6-NEXT: ; %bb.3: ; %Flow20
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX6-NEXT: .LBB28_4: ; %Flow17
+; GFX6-NEXT: .LBB28_4: ; %Flow21
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: v_readfirstlane_b32 s8, v1
@@ -7706,32 +7736,62 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_cbranch_execnz .LBB28_6
-; GFX6-NEXT: .LBB28_7: ; %Flow15
+; GFX6-NEXT: .LBB28_7: ; %Flow19
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v2, s2
-; GFX6-NEXT: ds_read_b32 v1, v2
; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
-; GFX6-NEXT: v_add_f32_e32 v0, s8, v0
+; GFX6-NEXT: s_mov_b64 s[4:5], exec
+; GFX6-NEXT: v_add_f32_e32 v2, s8, v0
+; GFX6-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX6-NEXT: ; implicit-def: $vgpr0
+; GFX6-NEXT: .LBB28_8: ; %ComputeLoop
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_ff1_i32_b64 s3, s[4:5]
+; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX6-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0
+; GFX6-NEXT: v_readfirstlane_b32 s8, v1
+; GFX6-NEXT: v_readlane_b32 s9, v2, s3
+; GFX6-NEXT: s_mov_b32 m0, s3
+; GFX6-NEXT: v_writelane_b32 v0, s8, m0
+; GFX6-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX6-NEXT: s_cbranch_vccnz .LBB28_8
+; GFX6-NEXT: ; %bb.9: ; %ComputeEnd
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX6-NEXT: ; implicit-def: $vgpr2
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execz .LBB28_13
+; GFX6-NEXT: ; %bb.10:
+; GFX6-NEXT: v_mov_b32_e32 v3, s2
+; GFX6-NEXT: s_mov_b32 m0, -1
+; GFX6-NEXT: ds_read_b32 v2, v3
; GFX6-NEXT: s_mov_b64 s[2:3], 0
-; GFX6-NEXT: .LBB28_8: ; %atomicrmw.start8
+; GFX6-NEXT: .LBB28_11: ; %atomicrmw.start8
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_add_f32_e32 v1, v3, v0
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v2, v3, v1
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: v_add_f32_e32 v2, v4, v1
+; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
; GFX6-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX6-NEXT: s_cbranch_execnz .LBB28_8
-; GFX6-NEXT: ; %bb.9: ; %atomicrmw.end7
+; GFX6-NEXT: s_cbranch_execnz .LBB28_11
+; GFX6-NEXT: ; %bb.12: ; %Flow
; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX6-NEXT: .LBB28_13: ; %Flow17
+; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX6-NEXT: v_readfirstlane_b32 s4, v2
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_add_f32_e32 v0, s4, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
%idx.add = add nuw i32 %idx, 4
%shl0 = shl i32 %idx.add, 3
@@ -8332,9 +8392,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB29_2
-; GFX7-NEXT: ; %bb.3: ; %Flow18
+; GFX7-NEXT: ; %bb.3: ; %Flow22
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: .LBB29_4: ; %Flow19
+; GFX7-NEXT: .LBB29_4: ; %Flow23
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: v_readfirstlane_b32 s8, v1
@@ -8362,32 +8422,62 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB29_6
-; GFX7-NEXT: .LBB29_7: ; %Flow17
+; GFX7-NEXT: .LBB29_7: ; %Flow21
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: ds_read_b32 v1, v2
; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
-; GFX7-NEXT: v_add_f32_e32 v0, s8, v0
+; GFX7-NEXT: s_mov_b64 s[4:5], exec
+; GFX7-NEXT: v_add_f32_e32 v2, s8, v0
+; GFX7-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX7-NEXT: ; implicit-def: $vgpr0
+; GFX7-NEXT: .LBB29_8: ; %ComputeLoop
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_ff1_i32_b64 s3, s[4:5]
+; GFX7-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX7-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX7-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0
+; GFX7-NEXT: v_readfirstlane_b32 s8, v1
+; GFX7-NEXT: v_readlane_b32 s9, v2, s3
+; GFX7-NEXT: s_mov_b32 m0, s3
+; GFX7-NEXT: v_writelane_b32 v0, s8, m0
+; GFX7-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX7-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_vccnz .LBB29_8
+; GFX7-NEXT: ; %bb.9: ; %ComputeEnd
+; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0
+; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX7-NEXT: ; implicit-def: $vgpr2
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execz .LBB29_13
+; GFX7-NEXT: ; %bb.10:
+; GFX7-NEXT: v_mov_b32_e32 v3, s2
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_read_b32 v2, v3
; GFX7-NEXT: s_mov_b64 s[2:3], 0
-; GFX7-NEXT: .LBB29_8: ; %atomicrmw.start8
+; GFX7-NEXT: .LBB29_11: ; %atomicrmw.start8
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_add_f32_e32 v1, v3, v0
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v2, v3, v1
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_add_f32_e32 v2, v4, v1
+; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX7-NEXT: s_cbranch_execnz .LBB29_8
-; GFX7-NEXT: ; %bb.9: ; %atomicrmw.end7
+; GFX7-NEXT: s_cbranch_execnz .LBB29_11
+; GFX7-NEXT: ; %bb.12: ; %Flow
; GFX7-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7-NEXT: .LBB29_13: ; %Flow19
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX7-NEXT: v_readfirstlane_b32 s4, v2
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: v_add_f32_e32 v0, s4, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX6-LABEL: local_ds_fadd_one_as:
@@ -8422,9 +8512,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_cbranch_execnz .LBB29_2
-; GFX6-NEXT: ; %bb.3: ; %Flow16
+; GFX6-NEXT: ; %bb.3: ; %Flow20
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX6-NEXT: .LBB29_4: ; %Flow17
+; GFX6-NEXT: .LBB29_4: ; %Flow21
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: v_readfirstlane_b32 s8, v1
@@ -8452,32 +8542,62 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_cbranch_execnz .LBB29_6
-; GFX6-NEXT: .LBB29_7: ; %Flow15
+; GFX6-NEXT: .LBB29_7: ; %Flow19
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v2, s2
-; GFX6-NEXT: ds_read_b32 v1, v2
; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
-; GFX6-NEXT: v_add_f32_e32 v0, s8, v0
+; GFX6-NEXT: s_mov_b64 s[4:5], exec
+; GFX6-NEXT: v_add_f32_e32 v2, s8, v0
+; GFX6-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX6-NEXT: ; implicit-def: $vgpr0
+; GFX6-NEXT: .LBB29_8: ; %ComputeLoop
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_ff1_i32_b64 s3, s[4:5]
+; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX6-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0
+; GFX6-NEXT: v_readfirstlane_b32 s8, v1
+; GFX6-NEXT: v_readlane_b32 s9, v2, s3
+; GFX6-NEXT: s_mov_b32 m0, s3
+; GFX6-NEXT: v_writelane_b32 v0, s8, m0
+; GFX6-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX6-NEXT: s_cbranch_vccnz .LBB29_8
+; GFX6-NEXT: ; %bb.9: ; %ComputeEnd
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX6-NEXT: ; implicit-def: $vgpr2
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execz .LBB29_13
+; GFX6-NEXT: ; %bb.10:
+; GFX6-NEXT: v_mov_b32_e32 v3, s2
+; GFX6-NEXT: s_mov_b32 m0, -1
+; GFX6-NEXT: ds_read_b32 v2, v3
; GFX6-NEXT: s_mov_b64 s[2:3], 0
-; GFX6-NEXT: .LBB29_8: ; %atomicrmw.start8
+; GFX6-NEXT: .LBB29_11: ; %atomicrmw.start8
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_add_f32_e32 v1, v3, v0
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v2, v3, v1
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: v_add_f32_e32 v2, v4, v1
+; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
; GFX6-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX6-NEXT: s_cbranch_execnz .LBB29_8
-; GFX6-NEXT: ; %bb.9: ; %atomicrmw.end7
+; GFX6-NEXT: s_cbranch_execnz .LBB29_11
+; GFX6-NEXT: ; %bb.12: ; %Flow
; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX6-NEXT: .LBB29_13: ; %Flow17
+; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX6-NEXT: v_readfirstlane_b32 s4, v2
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_add_f32_e32 v0, s4, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
%idx.add = add nuw i32 %idx, 4
%shl0 = shl i32 %idx.add, 3
More information about the llvm-branch-commits
mailing list