[llvm] [AMDGPU] Occupancy w.r.t. workgroup size range is also a range (PR #123748)
Lucas Ramirez via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 23 05:45:45 PST 2025
https://github.com/lucas-rami updated https://github.com/llvm/llvm-project/pull/123748
>From c3bf55b6369f8399bc0da2d509a03b63f928798d Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Thu, 12 Dec 2024 13:41:40 +0100
Subject: [PATCH 1/5] [AMDGPU] Occupancy w.r.t. WG size is now a range
All unit tests updated.
---
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 4 +-
.../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2 +-
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 125 +-
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 18 +-
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +-
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 9 +-
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 20 +-
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 16 +-
.../Target/AMDGPU/SIMachineFunctionInfo.cpp | 5 +-
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 9 +-
.../CodeGen/AMDGPU/GlobalISel/add.vni16.ll | 140 +-
.../CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll | 336 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 434 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 406 +-
.../AMDGPU/GlobalISel/insertelement.ll | 30 +-
.../CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll | 275 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 384 +-
.../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 192 +-
.../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 998 +--
.../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 395 +-
.../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 1573 ++---
.../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 116 +-
.../CodeGen/AMDGPU/GlobalISel/udiv.i64.ll | 1284 ++--
.../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 463 +-
.../CodeGen/AMDGPU/GlobalISel/urem.i64.ll | 1374 ++--
.../test/CodeGen/AMDGPU/GlobalISel/usubsat.ll | 44 +-
llvm/test/CodeGen/AMDGPU/abs_i16.ll | 174 +-
llvm/test/CodeGen/AMDGPU/add.ll | 64 +-
llvm/test/CodeGen/AMDGPU/addrspacecast.ll | 460 +-
.../AMDGPU/agpr-copy-no-free-registers.ll | 80 +-
.../CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll | 4 +-
llvm/test/CodeGen/AMDGPU/bf16.ll | 6164 ++++++++---------
.../test/CodeGen/AMDGPU/branch-relax-spill.ll | 8 +-
.../CodeGen/AMDGPU/calling-conventions.ll | 6 +-
.../AMDGPU/dbg-value-ends-sched-region.mir | 32 +-
.../AMDGPU/debug-value-scheduler-crash.mir | 38 +-
llvm/test/CodeGen/AMDGPU/div_i128.ll | 212 +-
llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 1995 +++---
.../CodeGen/AMDGPU/extract_vector_elt-f16.ll | 53 +-
llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 649 +-
llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 560 +-
llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll | 168 +-
llvm/test/CodeGen/AMDGPU/function-args.ll | 1732 ++---
llvm/test/CodeGen/AMDGPU/function-returns.ll | 28 +-
.../AMDGPU/gfx-callable-argument-types.ll | 12 +-
.../AMDGPU/gfx-callable-return-types.ll | 190 +-
llvm/test/CodeGen/AMDGPU/half.ll | 509 +-
llvm/test/CodeGen/AMDGPU/idot8s.ll | 6 +-
.../CodeGen/AMDGPU/indirect-addressing-si.ll | 966 +--
.../AMDGPU/insert_vector_elt.v2bf16.ll | 246 +-
.../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 224 +-
.../CodeGen/AMDGPU/integer-mad-patterns.ll | 84 +-
llvm/test/CodeGen/AMDGPU/licm-regpressure.mir | 16 +-
llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 324 +-
llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll | 198 +-
llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 150 +-
llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll | 198 +-
llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll | 97 +-
llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 1690 +++--
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 2106 +++---
llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 819 ++-
llvm/test/CodeGen/AMDGPU/load-constant-i64.ll | 95 +-
llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 1472 ++--
llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 2 +-
llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 2 +-
.../machine-scheduler-sink-trivial-remats.mir | 160 +-
llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 4 +-
llvm/test/CodeGen/AMDGPU/memory_clause.mir | 2 +-
.../AMDGPU/min-waves-per-eu-not-respected.ll | 2 +-
llvm/test/CodeGen/AMDGPU/mul.ll | 101 +-
.../CodeGen/AMDGPU/mul24-pass-ordering.ll | 16 +-
llvm/test/CodeGen/AMDGPU/permute_i8.ll | 148 +-
llvm/test/CodeGen/AMDGPU/pr51516.mir | 4 +-
.../AMDGPU/promote-constOffset-to-imm.ll | 251 +-
llvm/test/CodeGen/AMDGPU/rem_i128.ll | 94 +-
.../CodeGen/AMDGPU/remat-fp64-constants.ll | 4 +-
.../AMDGPU/resource-optimization-remarks.ll | 6 +-
llvm/test/CodeGen/AMDGPU/rsq.f64.ll | 218 +-
...dleMoveUp-subreg-def-across-subreg-def.mir | 4 +-
.../AMDGPU/schedule-amdgpu-trackers.ll | 14 +-
llvm/test/CodeGen/AMDGPU/schedule-barrier.mir | 18 +-
.../schedule-regpressure-limit-clustering.ll | 2 +-
.../AMDGPU/schedule-relaxed-occupancy.ll | 12 +-
llvm/test/CodeGen/AMDGPU/sdiv.ll | 408 +-
llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 14 +-
llvm/test/CodeGen/AMDGPU/select.f16.ll | 368 +-
llvm/test/CodeGen/AMDGPU/shift-i128.ll | 36 +-
llvm/test/CodeGen/AMDGPU/shl.ll | 22 +-
llvm/test/CodeGen/AMDGPU/sra.ll | 44 +-
llvm/test/CodeGen/AMDGPU/srem.ll | 232 +-
llvm/test/CodeGen/AMDGPU/srl.ll | 22 +-
llvm/test/CodeGen/AMDGPU/ssubsat.ll | 20 +-
llvm/test/CodeGen/AMDGPU/udiv.ll | 36 +-
93 files changed, 16171 insertions(+), 16578 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 737b2f740d6f77..bdf12ccb302cbc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -455,7 +455,7 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
uint64_t NumSGPRsForWavesPerEU = std::max(
{NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
- STM.computeOccupancy(F, MFI.getLDSSize()),
+ STM.getOccupancyWithWorkGroupSizes(*MF).second,
MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), STM,
OutContext);
@@ -1262,7 +1262,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
}
ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
- STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,
+ STM.computeOccupancy(F, ProgInfo.LDSSize).second, ProgInfo.NumSGPRsForWavesPerEU,
ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
const auto [MinWEU, MaxWEU] =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index e27ef71c1c0883..907f82ed7fc528 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -1344,7 +1344,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
}
unsigned MaxOccupancy =
- ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, F);
+ ST.getOccupancyWithWorkGroupSizes(CurrentLocalMemUsage, F).second;
// Restrict local memory usage so that we don't drastically reduce occupancy,
// unless it is already significantly reduced.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index ae563df2a7a128..da729d4dc7e089 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -55,55 +55,92 @@ AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
return getLocalMemorySize() / WorkGroupsPerCU;
}
-// FIXME: Should return min,max range.
-//
-// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
-// be achieved when only the given function is running on the machine; and
-// taking into account the overall number of wave slots, the (maximum) workgroup
-// size, and the per-workgroup LDS allocation size.
-unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
- const Function &F) const {
- const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
- const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
- if (!MaxWorkGroupsPerCu)
- return 0;
-
- const unsigned WaveSize = getWavefrontSize();
-
- // FIXME: Do we need to account for alignment requirement of LDS rounding the
- // size up?
- // Compute restriction based on LDS usage
- unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
-
- // This can be queried with more LDS than is possible, so just assume the
- // worst.
- if (NumGroups == 0)
- return 1;
-
- NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
-
- // Round to the number of waves per CU.
- const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
- unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
-
- // Number of waves per EU (SIMD).
- MaxWaves = divideCeil(MaxWaves, getEUsPerCU());
-
- // Clamp to the maximum possible number of waves.
- MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
+std::pair<unsigned, unsigned>
+AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes,
+ const Function &F) const {
+ // FIXME: Is there an allocation granularity for the LDS? If so we would need
+ // to make sure the amount of bytes is aligned on that granularity.
+
+ // Compute occupancy restriction based on LDS usage.
+ const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u);
+
+ // Queried LDS size may be larger than available on a CU, in which case we
+ // consider the only achievable occupancy to be 1, in line with what we
+ // consider the occupancy to be when the number of requested registers in a
+ // particular bank is higher than the number of available ones in that bank.
+ if (!MaxWGsLDS)
+ return {1, 1};
+
+ const unsigned WaveSize = getWavefrontSize(), WavesPerEU = getMaxWavesPerEU();
+ const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU();
+
+ auto PropsFromWGSize = [&](unsigned WGSize)
+ -> std::tuple<const unsigned, const unsigned, unsigned> {
+ unsigned WavesPerWG = divideCeil(WGSize, WaveSize);
+ unsigned WGsPerCU = std::min(getMaxWorkGroupsPerCU(WGSize), MaxWGsLDS);
+ return {WavesPerWG, WGsPerCU, WavesPerWG * WGsPerCU};
+ };
+
+ // The maximum group size will generally yield the minimum number of
+ // workgroups, maximum number of waves, and minimum occupancy. The opposite is
+ // generally true for the minimum group size. LDS or barrier resource
+ // limitations can flip those minimums/maximums.
+ const auto [MinWGSize, MaxWGSize] = getFlatWorkGroupSizes(F);
+ auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize);
+ auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize);
+
+ // It is possible that we end up with flipped minimum and maximum number of
+ // waves per CU when the number of minimum/maximum concurrent groups on the CU
+ // is limited by LDS usage or barrier resources.
+ if (MinWavesPerCU >= MaxWavesPerCU) {
+ std::swap(MinWavesPerCU, MaxWavesPerCU);
+ } else {
+ // Look for a potential smaller group size than the maximum which decreases
+ // the concurrent number of waves on the CU for the same number of
+ // concurrent workgroups on the CU.
+ unsigned MinWavesPerCUForWGSize =
+ divideCeil(WaveSlotsPerCU, MinWGsPerCU + 1) * MinWGsPerCU;
+ if (MinWavesPerCU > MinWavesPerCUForWGSize) {
+ unsigned ExcessSlots = MinWavesPerCU - MinWavesPerCUForWGSize;
+ if (unsigned ExcessSlotsPerWG = ExcessSlots / MinWGsPerCU) {
+ // There may exist a smaller group size than the maximum that achieves
+ // the minimum number of waves per CU. This group size is the largest
+ // possible size that requires MaxWavesPerWG - E waves where E is
+ // maximized under the following constraints.
+ // 1. 0 <= E <= ExcessSlotsPerWG
+ // 2. (MaxWavesPerWG - E) * WaveSize >= MinWGSize
+ MinWavesPerCU -= MinWGsPerCU * std::min(ExcessSlotsPerWG,
+ MaxWavesPerWG - MinWavesPerWG);
+ }
+ }
- // FIXME: Needs to be a multiple of the group size?
- //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
+ // Look for a potential larger group size than the minimum which increases
+ // the concurrent number of waves on the CU for the same number of
+ // concurrent workgroups on the CU.
+ unsigned LeftoverSlots = WaveSlotsPerCU - MaxWGsPerCU * MinWavesPerWG;
+ if (unsigned LeftoverSlotsPerWG = LeftoverSlots / MaxWGsPerCU) {
+ // There may exist a larger group size than the minimum that achieves the
+ // maximum number of waves per CU. This group size is the smallest
+ // possible size that requires MinWavesPerWG + L waves where L is
+ // maximized under the following constraints.
+ // 1. 0 <= L <= LeftoverSlotsPerWG
+ // 2. (MinWavesPerWG + L - 1) * WaveSize <= MaxWGSize
+ MaxWavesPerCU += MaxWGsPerCU * std::min(LeftoverSlotsPerWG,
+ ((MaxWGSize - 1) / WaveSize) + 1 -
+ MinWavesPerWG);
+ }
+ }
- assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
- "computed invalid occupancy");
- return MaxWaves;
+ // Return the minimum/maximum number of waves on any EU, assuming that all
+ // wavefronts are spread across all EUs as evenly as possible.
+ return {std::clamp(MinWavesPerCU / getEUsPerCU(), 1U, WavesPerEU),
+ std::clamp(divideCeil(MaxWavesPerCU, getEUsPerCU()), 1U, WavesPerEU)};
}
-unsigned
-AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
+std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
+ const MachineFunction &MF) const {
const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
- return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
+ return getOccupancyWithWorkGroupSizes(MFI->getLDSSize(), MF.getFunction());
}
std::pair<unsigned, unsigned>
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 7701fef5365841..5944b69ce64162 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -127,11 +127,21 @@ class AMDGPUSubtarget {
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
const Function &) const;
- /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
- /// the given LDS memory size is the only constraint.
- unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
+ /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
+ /// be achieved when the only function running on a CU is \p F and each
+ /// workgroup running the function requires \p LDSBytes bytes of LDS space.
+ /// This notably depends on the range of allowed flat group sizes for the
+ /// function and hardware characteristics.
+ std::pair<unsigned, unsigned>
+ getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const;
- unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const;
+ /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
+ /// be achieved when the only function running on a CU is \p MF. This notably
+ /// depends on the range of allowed flat group sizes for the function, the
+ /// amount of per-workgroup LDS space required by the function, and hardware
+ /// characteristics.
+ std::pair<unsigned, unsigned>
+ getOccupancyWithWorkGroupSizes(const MachineFunction &MF) const;
bool isAmdHsaOS() const {
return TargetTriple.getOS() == Triple::AMDHSA;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f8b60630bb7f6c..05acd418a1cd0d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1717,7 +1717,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
if (MFI->Occupancy == 0) {
// Fixup the subtarget dependent default value.
- MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
+ MFI->Occupancy = ST.getOccupancyWithWorkGroupSizes(MF).second;
}
auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index f5bbc5482d347c..b00105ae9bd528 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1089,9 +1089,8 @@ bool PreRARematStage::initGCNSchedStage() {
return false;
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
- // Check maximum occupancy
- if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
- DAG.MinOccupancy)
+ // Rematerialization will not help if occupancy is not limited by reg usage.
+ if (ST.getOccupancyWithWorkGroupSizes(MF).second == DAG.MinOccupancy)
return false;
// FIXME: This pass will invalidate cached MBBLiveIns for regions
@@ -1272,8 +1271,8 @@ void GCNSchedStage::checkScheduling() {
return;
}
- unsigned TargetOccupancy =
- std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF));
+ unsigned TargetOccupancy = std::min(
+ S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
unsigned WavesAfter =
std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
unsigned WavesBefore =
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 117afc4a8e8c60..22a550450dc2eb 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -405,16 +405,16 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}
-unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
- unsigned NumSGPRs,
- unsigned NumVGPRs) const {
- unsigned Occupancy =
- std::min(getMaxWavesPerEU(), getOccupancyWithLocalMemSize(LDSSize, F));
- if (NumSGPRs)
- Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
- if (NumVGPRs)
- Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
- return Occupancy;
+std::pair<unsigned, unsigned>
+GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
+ unsigned NumSGPRs, unsigned NumVGPRs) const {
+ auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
+ unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
+ unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs);
+
+ // Maximum occupancy may be further limited by high SGPR/VGPR usage.
+ MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
+ return {std::min(MinOcc, MaxOcc), MaxOcc};
}
unsigned GCNSubtarget::getBaseMaxNumSGPRs(
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 3388bc3c5a8de1..a22e413508021d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1368,12 +1368,18 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
/// VGPRs
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
- /// Return occupancy for the given function. Used LDS and a number of
- /// registers if provided.
- /// Note, occupancy can be affected by the scratch allocation as well, but
+ /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
+ /// be achieved when the only function running on a CU is \p F, each workgroup
+ /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
+ /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a
+ /// range, so this returns a range as well.
+ ///
+ /// Note that occupancy can be affected by the scratch allocation as well, but
/// we do not have enough information to compute it.
- unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
- unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
+ std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
+ unsigned LDSSize = 0,
+ unsigned NumSGPRs = 0,
+ unsigned NumVGPRs = 0) const;
/// \returns true if the flat_scratch register should be initialized with the
/// pointer to the wave's scratch memory rather than a size and offset.
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 169f1369fb5433..b73af929409064 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -48,7 +48,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
assert(MaxNumWorkGroups.size() == 3);
- Occupancy = ST.computeOccupancy(F, getLDSSize());
+ Occupancy = ST.computeOccupancy(F, getLDSSize()).second;
CallingConv::ID CC = F.getCallingConv();
VRegFlags.reserve(1024);
@@ -185,8 +185,7 @@ MachineFunctionInfo *SIMachineFunctionInfo::clone(
void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
limitOccupancy(getMaxWavesPerEU());
const GCNSubtarget& ST = MF.getSubtarget<GCNSubtarget>();
- limitOccupancy(ST.getOccupancyWithLocalMemSize(getLDSSize(),
- MF.getFunction()));
+ limitOccupancy(ST.getOccupancyWithWorkGroupSizes(MF).second);
}
Register SIMachineFunctionInfo::addPrivateSegmentBuffer(
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 704435dad65d7b..11121e6058770f 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3642,18 +3642,15 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
- unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
- MF.getFunction());
+ unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first;
switch (RC->getID()) {
default:
return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
case AMDGPU::VGPR_32RegClassID:
- return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
+ return std::min(ST.getMaxNumVGPRs(MinOcc), ST.getMaxNumVGPRs(MF));
case AMDGPU::SGPR_32RegClassID:
case AMDGPU::SGPR_LO16RegClassID:
- return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
+ return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
index ab95c226b08b02..27b93872b9f1df 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
@@ -513,29 +513,29 @@ define void @add_v9i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_ushort v16, v[0:1]
+; GFX8-NEXT: flat_load_ushort v14, v[0:1]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v4
-; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_add_u16_e32 v1, v6, v10
; GFX8-NEXT: v_add_u16_sdwa v2, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_u16_e32 v3, v7, v11
-; GFX8-NEXT: v_add_u16_sdwa v6, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_u16_e32 v7, v8, v12
+; GFX8-NEXT: v_add_u16_sdwa v10, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v11, v8, v12
; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_u16_e32 v10, v9, v13
+; GFX8-NEXT: v_add_u16_e32 v12, v9, v13
; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v4
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u16_e32 v11, v16, v0
+; GFX8-NEXT: v_add_u16_e32 v13, v14, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v2
-; GFX8-NEXT: v_or_b32_e32 v1, v3, v6
-; GFX8-NEXT: v_or_b32_e32 v2, v7, v8
-; GFX8-NEXT: v_or_b32_e32 v3, v10, v9
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v10
+; GFX8-NEXT: v_or_b32_e32 v2, v11, v8
+; GFX8-NEXT: v_or_b32_e32 v3, v12, v9
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: flat_store_short v[14:15], v11
+; GFX8-NEXT: flat_store_short v[6:7], v13
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -661,55 +661,55 @@ define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
; GFX8-LABEL: add_v11i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 16, v0
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v12, vcc, 18, v0
-; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 20, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_ushort v18, v[10:11]
-; GFX8-NEXT: flat_load_ushort v19, v[12:13]
-; GFX8-NEXT: flat_load_ushort v20, v[0:1]
; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 18, v2
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v2
; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, 18, v2
+; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 20, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: flat_load_ushort v1, v[14:15]
-; GFX8-NEXT: flat_load_ushort v2, v[2:3]
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v4
-; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v5, vcc
-; GFX8-NEXT: v_add_u32_e32 v16, vcc, 18, v4
-; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_ushort v14, v[14:15]
+; GFX8-NEXT: flat_load_ushort v15, v[16:17]
+; GFX8-NEXT: flat_load_ushort v16, v[2:3]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(3)
-; GFX8-NEXT: v_add_u16_e32 v3, v6, v10
+; GFX8-NEXT: v_add_u16_e32 v17, v6, v10
; GFX8-NEXT: v_add_u16_sdwa v10, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_u16_e32 v21, v7, v11
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 18, v0
+; GFX8-NEXT: v_add_u16_e32 v18, v7, v11
; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_u16_e32 v22, v8, v12
-; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_u16_e32 v12, v9, v13
-; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 20, v4
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 20, v0
+; GFX8-NEXT: flat_load_ushort v2, v[2:3]
+; GFX8-NEXT: flat_load_ushort v3, v[6:7]
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v21, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v4
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
+; GFX8-NEXT: v_add_u16_e32 v19, v8, v12
+; GFX8-NEXT: v_add_u16_sdwa v12, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 18, v4
+; GFX8-NEXT: v_add_u16_e32 v20, v9, v13
+; GFX8-NEXT: v_add_u16_sdwa v13, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GFX8-NEXT: v_or_b32_e32 v0, v17, v10
+; GFX8-NEXT: v_or_b32_e32 v1, v18, v11
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 20, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_add_u16_e32 v13, v18, v0
+; GFX8-NEXT: v_add_u16_e32 v14, v2, v14
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_add_u16_e32 v18, v19, v1
+; GFX8-NEXT: v_add_u16_e32 v15, v3, v15
+; GFX8-NEXT: v_or_b32_e32 v2, v19, v12
+; GFX8-NEXT: v_or_b32_e32 v3, v20, v13
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u16_e32 v19, v20, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v3, v10
-; GFX8-NEXT: v_or_b32_e32 v1, v21, v11
-; GFX8-NEXT: v_or_b32_e32 v2, v22, v8
-; GFX8-NEXT: v_or_b32_e32 v3, v12, v9
+; GFX8-NEXT: v_add_u16_e32 v16, v21, v16
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: flat_store_short v[14:15], v13
-; GFX8-NEXT: flat_store_short v[16:17], v18
-; GFX8-NEXT: flat_store_short v[6:7], v19
+; GFX8-NEXT: flat_store_short v[6:7], v14
+; GFX8-NEXT: flat_store_short v[8:9], v15
+; GFX8-NEXT: flat_store_short v[10:11], v16
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -794,34 +794,34 @@ define void @add_v12i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1]
; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_add_u16_e32 v0, v6, v10
-; GFX8-NEXT: v_add_u16_sdwa v1, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_u16_e32 v2, v7, v11
-; GFX8-NEXT: v_add_u16_sdwa v3, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_u16_e32 v6, v8, v12
-; GFX8-NEXT: v_add_u16_sdwa v7, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_u16_e32 v8, v9, v13
+; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[2:3]
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_add_u16_e32 v2, v6, v10
+; GFX8-NEXT: v_add_u16_sdwa v3, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v10, v7, v11
+; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX8-NEXT: v_add_u16_e32 v16, v8, v12
+; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v12, v9, v13
; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_or_b32_e32 v1, v2, v3
-; GFX8-NEXT: v_or_b32_e32 v2, v6, v7
-; GFX8-NEXT: v_or_b32_e32 v3, v8, v9
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u16_e32 v6, v14, v16
-; GFX8-NEXT: v_add_u16_sdwa v7, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_u16_e32 v8, v15, v17
-; GFX8-NEXT: v_add_u16_sdwa v9, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v10, v11
+; GFX8-NEXT: v_or_b32_e32 v2, v16, v8
+; GFX8-NEXT: v_or_b32_e32 v3, v12, v9
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_add_u16_e32 v8, v6, v14
+; GFX8-NEXT: v_add_u16_sdwa v6, v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v9, v7, v15
+; GFX8-NEXT: v_add_u16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v4
-; GFX8-NEXT: v_or_b32_e32 v7, v8, v9
+; GFX8-NEXT: v_or_b32_e32 v6, v8, v6
+; GFX8-NEXT: v_or_b32_e32 v7, v9, v7
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
; GFX8-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
index 0b66185d25f3e2..8db1f46b0342a3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
@@ -712,33 +712,33 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
-; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], v[0:1], v[4:5], v[0:1]
+; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3]
; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v17
+; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1]
+; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15]
; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19
; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; GFX6-NEXT: v_div_scale_f64 v[12:13], s[4:5], v[6:7], v[6:7], v[2:3]
-; GFX6-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; GFX6-NEXT: v_rcp_f64_e32 v[18:19], v[12:13]
-; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[10:11]
-; GFX6-NEXT: v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0
-; GFX6-NEXT: v_fma_f64 v[22:23], -v[8:9], v[14:15], v[16:17]
-; GFX6-NEXT: v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19]
-; GFX6-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
-; GFX6-NEXT: v_fma_f64 v[16:17], -v[12:13], v[18:19], 1.0
; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9
-; GFX6-NEXT: v_fma_f64 v[8:9], v[18:19], v[16:17], v[18:19]
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_mul_f64 v[16:17], v[20:21], v[8:9]
-; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[22:23], v[10:11], v[14:15]
-; GFX6-NEXT: v_fma_f64 v[14:15], -v[12:13], v[16:17], v[20:21]
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v21
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v13
+; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
+; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15
+; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17]
+; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11]
+; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19]
+; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0
+; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17]
+; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13]
+; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3]
; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1]
-; GFX6-NEXT: s_nop 0
-; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[14:15], v[8:9], v[16:17]
+; GFX6-NEXT: v_mul_f64 v[16:17], v[12:13], v[8:9]
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v13
+; GFX6-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13]
+; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
+; GFX6-NEXT: s_nop 1
+; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17]
; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3]
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -747,26 +747,26 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
; GFX8-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3]
-; GFX8-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[8:9]
; GFX8-NEXT: v_rcp_f64_e32 v[14:15], v[10:11]
; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
; GFX8-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
-; GFX8-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1]
; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
-; GFX8-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0
-; GFX8-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0
-; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
-; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15]
-; GFX8-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13]
-; GFX8-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15]
-; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17]
-; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21]
-; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19]
+; GFX8-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1]
+; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
+; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0
+; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
+; GFX8-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13]
+; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19]
+; GFX8-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3]
+; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17]
; GFX8-NEXT: s_mov_b64 vcc, s[4:5]
-; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
+; GFX8-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15]
; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
+; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19]
+; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21]
; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -775,26 +775,26 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
; GFX9-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3]
-; GFX9-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[8:9]
; GFX9-NEXT: v_rcp_f64_e32 v[14:15], v[10:11]
; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
; GFX9-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
-; GFX9-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1]
; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
-; GFX9-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0
-; GFX9-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0
-; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
-; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15]
-; GFX9-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13]
-; GFX9-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15]
-; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17]
-; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21]
-; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19]
+; GFX9-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1]
+; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
+; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0
+; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
+; GFX9-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13]
+; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19]
+; GFX9-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3]
+; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17]
; GFX9-NEXT: s_mov_b64 vcc, s[4:5]
-; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
+; GFX9-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15]
; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
+; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19]
+; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21]
; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -945,33 +945,33 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
-; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], v[0:1], v[4:5], v[0:1]
+; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3]
; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v17
+; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1]
+; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15]
; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19
; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; GFX6-NEXT: v_div_scale_f64 v[12:13], s[4:5], v[6:7], v[6:7], v[2:3]
-; GFX6-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; GFX6-NEXT: v_rcp_f64_e32 v[18:19], v[12:13]
-; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[10:11]
-; GFX6-NEXT: v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0
-; GFX6-NEXT: v_fma_f64 v[22:23], -v[8:9], v[14:15], v[16:17]
-; GFX6-NEXT: v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19]
-; GFX6-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
-; GFX6-NEXT: v_fma_f64 v[16:17], -v[12:13], v[18:19], 1.0
; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9
-; GFX6-NEXT: v_fma_f64 v[8:9], v[18:19], v[16:17], v[18:19]
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_mul_f64 v[16:17], v[20:21], v[8:9]
-; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[22:23], v[10:11], v[14:15]
-; GFX6-NEXT: v_fma_f64 v[14:15], -v[12:13], v[16:17], v[20:21]
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v21
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v13
+; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
+; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15
+; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17]
+; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11]
+; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19]
+; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0
+; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17]
+; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13]
+; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3]
; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1]
-; GFX6-NEXT: s_nop 0
-; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[14:15], v[8:9], v[16:17]
+; GFX6-NEXT: v_mul_f64 v[16:17], v[12:13], v[8:9]
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v13
+; GFX6-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13]
+; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
+; GFX6-NEXT: s_nop 1
+; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17]
; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3]
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -980,26 +980,26 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
; GFX8-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3]
-; GFX8-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[8:9]
; GFX8-NEXT: v_rcp_f64_e32 v[14:15], v[10:11]
; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
; GFX8-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
-; GFX8-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1]
; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
-; GFX8-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0
-; GFX8-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0
-; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
-; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15]
-; GFX8-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13]
-; GFX8-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15]
-; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17]
-; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21]
-; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19]
+; GFX8-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1]
+; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
+; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0
+; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
+; GFX8-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13]
+; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19]
+; GFX8-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3]
+; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17]
; GFX8-NEXT: s_mov_b64 vcc, s[4:5]
-; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
+; GFX8-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15]
; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
+; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19]
+; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21]
; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1008,26 +1008,26 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
; GFX9-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3]
-; GFX9-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[8:9]
; GFX9-NEXT: v_rcp_f64_e32 v[14:15], v[10:11]
; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
; GFX9-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
-; GFX9-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1]
; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
-; GFX9-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0
-; GFX9-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0
-; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
-; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15]
-; GFX9-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13]
-; GFX9-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15]
-; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17]
-; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21]
-; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19]
+; GFX9-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1]
+; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
+; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0
+; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
+; GFX9-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13]
+; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19]
+; GFX9-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3]
+; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17]
; GFX9-NEXT: s_mov_b64 vcc, s[4:5]
-; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
+; GFX9-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15]
; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
+; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19]
+; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21]
; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1106,7 +1106,7 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) {
; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0
; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v20, 0x3ff00000
+; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000
; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11]
; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0
@@ -1115,23 +1115,23 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) {
; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0
; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v20
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v18
; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7]
-; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
-; GFX6-NEXT: v_fma_f64 v[18:19], -v[4:5], v[12:13], v[8:9]
-; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], 1.0
; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5
-; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[8:9], v[14:15]
+; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9]
+; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0
+; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0
+; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15]
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_mul_f64 v[8:9], v[16:17], v[4:5]
-; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[18:19], v[6:7], v[12:13]
-; GFX6-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[16:17]
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v20
+; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5]
+; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13]
+; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17]
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18
; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0
; GFX6-NEXT: s_nop 0
-; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15]
; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -1266,7 +1266,7 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) {
; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0
; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v20, 0x3ff00000
+; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000
; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11]
; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0
@@ -1275,23 +1275,23 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) {
; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0
; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v20
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v18
; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7]
-; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
-; GFX6-NEXT: v_fma_f64 v[18:19], -v[4:5], v[12:13], v[8:9]
-; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], 1.0
; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5
-; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[8:9], v[14:15]
+; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9]
+; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0
+; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0
+; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15]
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_mul_f64 v[8:9], v[16:17], v[4:5]
-; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[18:19], v[6:7], v[12:13]
-; GFX6-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[16:17]
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v20
+; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5]
+; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13]
+; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17]
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18
; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0
; GFX6-NEXT: s_nop 0
-; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15]
; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -1493,7 +1493,7 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) {
; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0
; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v20, 0x3ff00000
+; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000
; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11]
; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0
@@ -1502,23 +1502,23 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) {
; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0
; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v20
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v18
; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7]
-; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
-; GFX6-NEXT: v_fma_f64 v[18:19], -v[4:5], v[12:13], v[8:9]
-; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], 1.0
; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5
-; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[8:9], v[14:15]
+; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9]
+; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0
+; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0
+; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15]
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_mul_f64 v[8:9], v[16:17], v[4:5]
-; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[18:19], v[6:7], v[12:13]
-; GFX6-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[16:17]
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v20
+; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5]
+; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13]
+; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17]
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18
; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0
; GFX6-NEXT: s_nop 0
-; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15]
; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -1725,33 +1725,33 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
-; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], v[0:1], v[4:5], v[0:1]
+; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3]
; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v17
+; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1]
+; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15]
; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19
; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; GFX6-NEXT: v_div_scale_f64 v[12:13], s[4:5], v[6:7], v[6:7], v[2:3]
-; GFX6-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; GFX6-NEXT: v_rcp_f64_e32 v[18:19], v[12:13]
-; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[10:11]
-; GFX6-NEXT: v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0
-; GFX6-NEXT: v_fma_f64 v[22:23], -v[8:9], v[14:15], v[16:17]
-; GFX6-NEXT: v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19]
-; GFX6-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
-; GFX6-NEXT: v_fma_f64 v[16:17], -v[12:13], v[18:19], 1.0
; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9
-; GFX6-NEXT: v_fma_f64 v[8:9], v[18:19], v[16:17], v[18:19]
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_mul_f64 v[16:17], v[20:21], v[8:9]
-; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[22:23], v[10:11], v[14:15]
-; GFX6-NEXT: v_fma_f64 v[14:15], -v[12:13], v[16:17], v[20:21]
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v21
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v13
+; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
+; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15
+; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17]
+; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11]
+; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19]
+; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0
+; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17]
+; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13]
+; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3]
; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1]
-; GFX6-NEXT: s_nop 0
-; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[14:15], v[8:9], v[16:17]
+; GFX6-NEXT: v_mul_f64 v[16:17], v[12:13], v[8:9]
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v13
+; GFX6-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13]
+; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
+; GFX6-NEXT: s_nop 1
+; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17]
; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3]
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -1760,26 +1760,26 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
; GFX8-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3]
-; GFX8-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[8:9]
; GFX8-NEXT: v_rcp_f64_e32 v[14:15], v[10:11]
; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
; GFX8-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
-; GFX8-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1]
; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
-; GFX8-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0
-; GFX8-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0
-; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
-; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15]
-; GFX8-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13]
-; GFX8-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15]
-; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17]
-; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21]
-; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19]
+; GFX8-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1]
+; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
+; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0
+; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
+; GFX8-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13]
+; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19]
+; GFX8-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3]
+; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17]
; GFX8-NEXT: s_mov_b64 vcc, s[4:5]
-; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
+; GFX8-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15]
; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
+; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19]
+; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21]
; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1788,26 +1788,26 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
; GFX9-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3]
-; GFX9-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[8:9]
; GFX9-NEXT: v_rcp_f64_e32 v[14:15], v[10:11]
; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
; GFX9-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
-; GFX9-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1]
; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
-; GFX9-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0
-; GFX9-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0
-; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
-; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15]
-; GFX9-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13]
-; GFX9-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15]
-; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17]
-; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21]
-; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19]
+; GFX9-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1]
+; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
+; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0
+; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
+; GFX9-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13]
+; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19]
+; GFX9-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3]
+; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17]
; GFX9-NEXT: s_mov_b64 vcc, s[4:5]
-; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
+; GFX9-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15]
; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
+; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19]
+; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21]
; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 5d76b542fad894..e60739fd84059b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -7678,274 +7678,274 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX6-LABEL: v_fshl_v2i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v16
-; GFX6-NEXT: v_not_b32_e32 v25, 63
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v19
-; GFX6-NEXT: v_add_i32_e32 v26, vcc, v19, v25
+; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v23
; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17
-; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19
-; GFX6-NEXT: v_lshl_b64 v[23:24], v[0:1], v19
-; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v26
-; GFX6-NEXT: v_or_b32_e32 v17, v17, v21
-; GFX6-NEXT: v_or_b32_e32 v18, v18, v22
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
-; GFX6-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
-; GFX6-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc
-; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], 1
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v10
-; GFX6-NEXT: v_not_b32_e32 v8, v16
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], 1
-; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v8
-; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v23
-; GFX6-NEXT: v_add_i32_e32 v24, vcc, v23, v25
-; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v23
-; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v10
-; GFX6-NEXT: v_lshr_b64 v[16:17], v[2:3], v23
-; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v24
-; GFX6-NEXT: v_or_b32_e32 v8, v8, v10
+; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v23
+; GFX6-NEXT: v_lshr_b64 v[8:9], v[8:9], 1
+; GFX6-NEXT: v_not_b32_e32 v16, v16
+; GFX6-NEXT: v_or_b32_e32 v21, v17, v21
+; GFX6-NEXT: v_lshlrev_b32_e32 v17, 31, v10
+; GFX6-NEXT: v_lshr_b64 v[10:11], v[10:11], 1
+; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16
+; GFX6-NEXT: v_or_b32_e32 v9, v9, v17
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v24
+; GFX6-NEXT: v_or_b32_e32 v22, v18, v22
+; GFX6-NEXT: v_lshl_b64 v[16:17], v[10:11], v16
+; GFX6-NEXT: v_lshr_b64 v[18:19], v[8:9], v24
+; GFX6-NEXT: v_not_b32_e32 v25, 63
+; GFX6-NEXT: v_or_b32_e32 v18, v18, v16
+; GFX6-NEXT: v_add_i32_e32 v16, vcc, v23, v25
+; GFX6-NEXT: v_or_b32_e32 v19, v19, v17
+; GFX6-NEXT: v_lshl_b64 v[16:17], v[0:1], v16
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v23
; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc
; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23
-; GFX6-NEXT: v_or_b32_e32 v9, v9, v11
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5]
+; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v24, v25
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], v0
+; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24
+; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v24
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24
+; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX6-NEXT: v_or_b32_e32 v2, v18, v2
-; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc
-; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v18
-; GFX6-NEXT: v_or_b32_e32 v3, v19, v3
-; GFX6-NEXT: v_add_i32_e32 v19, vcc, v18, v25
-; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v8
-; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v18
-; GFX6-NEXT: v_lshl_b64 v[16:17], v[4:5], v18
-; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v19
-; GFX6-NEXT: v_or_b32_e32 v8, v8, v10
-; GFX6-NEXT: v_or_b32_e32 v9, v9, v11
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18
-; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18
-; GFX6-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc
+; GFX6-NEXT: v_or_b32_e32 v0, v26, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v17, v8
+; GFX6-NEXT: v_and_b32_e32 v17, 0x7f, v20
+; GFX6-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5]
+; GFX6-NEXT: v_or_b32_e32 v1, v18, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 64, v17
+; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v3
+; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v17
+; GFX6-NEXT: v_or_b32_e32 v3, v16, v19
+; GFX6-NEXT: v_add_i32_e32 v16, vcc, v17, v25
+; GFX6-NEXT: v_or_b32_e32 v10, v8, v10
+; GFX6-NEXT: v_or_b32_e32 v11, v9, v11
+; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v17
+; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v16
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17
+; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
+; GFX6-NEXT: v_cndmask_b32_e32 v17, v4, v6, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc
; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], 1
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 31, v14
; GFX6-NEXT: v_not_b32_e32 v8, v20
; GFX6-NEXT: v_or_b32_e32 v5, v5, v6
; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], 1
-; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8
-; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v14
-; GFX6-NEXT: v_add_i32_e32 v15, vcc, v14, v25
-; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v14
+; GFX6-NEXT: v_and_b32_e32 v12, 0x7f, v8
+; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v12
+; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v12
; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v10
-; GFX6-NEXT: v_lshr_b64 v[12:13], v[6:7], v14
-; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v15
-; GFX6-NEXT: v_or_b32_e32 v8, v8, v10
-; GFX6-NEXT: v_or_b32_e32 v9, v9, v11
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
-; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GFX6-NEXT: v_add_i32_e32 v13, vcc, v12, v25
+; GFX6-NEXT: v_or_b32_e32 v10, v8, v10
+; GFX6-NEXT: v_or_b32_e32 v11, v9, v11
+; GFX6-NEXT: v_lshr_b64 v[8:9], v[6:7], v12
+; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v13
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12
+; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12
; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc
-; GFX6-NEXT: v_or_b32_e32 v0, v21, v0
-; GFX6-NEXT: v_or_b32_e32 v1, v22, v1
+; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
; GFX6-NEXT: v_or_b32_e32 v4, v16, v4
-; GFX6-NEXT: v_or_b32_e32 v5, v17, v5
-; GFX6-NEXT: v_or_b32_e32 v6, v18, v6
+; GFX6-NEXT: v_or_b32_e32 v5, v18, v5
+; GFX6-NEXT: v_or_b32_e32 v6, v17, v6
; GFX6-NEXT: v_or_b32_e32 v7, v19, v7
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshl_v2i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v16
-; GFX8-NEXT: v_not_b32_e32 v25, 63
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v19
-; GFX8-NEXT: v_add_u32_e32 v26, vcc, v19, v25
+; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v23
; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3]
-; GFX8-NEXT: v_lshlrev_b64 v[23:24], v19, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v26, v[0:1]
-; GFX8-NEXT: v_or_b32_e32 v17, v17, v21
-; GFX8-NEXT: v_or_b32_e32 v18, v18, v22
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
-; GFX8-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
-; GFX8-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9]
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v10
-; GFX8-NEXT: v_not_b32_e32 v8, v16
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11]
-; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v8
-; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v23
-; GFX8-NEXT: v_add_u32_e32 v24, vcc, v23, v25
-; GFX8-NEXT: v_lshrrev_b64 v[8:9], v23, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3]
-; GFX8-NEXT: v_lshrrev_b64 v[16:17], v23, v[2:3]
-; GFX8-NEXT: v_lshrrev_b64 v[2:3], v24, v[2:3]
-; GFX8-NEXT: v_or_b32_e32 v8, v8, v10
+; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3]
+; GFX8-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9]
+; GFX8-NEXT: v_not_b32_e32 v16, v16
+; GFX8-NEXT: v_or_b32_e32 v21, v17, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 31, v10
+; GFX8-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11]
+; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16
+; GFX8-NEXT: v_or_b32_e32 v9, v9, v17
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v24
+; GFX8-NEXT: v_or_b32_e32 v22, v18, v22
+; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11]
+; GFX8-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9]
+; GFX8-NEXT: v_not_b32_e32 v25, 63
+; GFX8-NEXT: v_or_b32_e32 v18, v18, v16
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v23, v25
+; GFX8-NEXT: v_or_b32_e32 v19, v19, v17
+; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1]
; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23
-; GFX8-NEXT: v_or_b32_e32 v9, v9, v11
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5]
+; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v24, v25
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11]
+; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11]
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX8-NEXT: v_or_b32_e32 v2, v18, v2
-; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc
-; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v18
-; GFX8-NEXT: v_or_b32_e32 v3, v19, v3
-; GFX8-NEXT: v_add_u32_e32 v19, vcc, v18, v25
-; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5]
-; GFX8-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7]
-; GFX8-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5]
-; GFX8-NEXT: v_lshlrev_b64 v[4:5], v19, v[4:5]
-; GFX8-NEXT: v_or_b32_e32 v8, v8, v10
-; GFX8-NEXT: v_or_b32_e32 v9, v9, v11
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18
-; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18
-; GFX8-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc
+; GFX8-NEXT: v_or_b32_e32 v0, v26, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v17, v8
+; GFX8-NEXT: v_and_b32_e32 v17, 0x7f, v20
+; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5]
+; GFX8-NEXT: v_or_b32_e32 v1, v18, v3
+; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 64, v17
+; GFX8-NEXT: v_lshrrev_b64 v[8:9], v3, v[4:5]
+; GFX8-NEXT: v_lshlrev_b64 v[10:11], v17, v[6:7]
+; GFX8-NEXT: v_or_b32_e32 v3, v16, v19
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v17, v25
+; GFX8-NEXT: v_or_b32_e32 v10, v8, v10
+; GFX8-NEXT: v_or_b32_e32 v11, v9, v11
+; GFX8-NEXT: v_lshlrev_b64 v[8:9], v17, v[4:5]
+; GFX8-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v17, v4, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc
; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13]
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 31, v14
; GFX8-NEXT: v_not_b32_e32 v8, v20
; GFX8-NEXT: v_or_b32_e32 v5, v5, v6
; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15]
-; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8
-; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v14
-; GFX8-NEXT: v_add_u32_e32 v15, vcc, v14, v25
-; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[4:5]
+; GFX8-NEXT: v_and_b32_e32 v12, 0x7f, v8
+; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v12
+; GFX8-NEXT: v_lshrrev_b64 v[8:9], v12, v[4:5]
; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7]
-; GFX8-NEXT: v_lshrrev_b64 v[12:13], v14, v[6:7]
-; GFX8-NEXT: v_lshrrev_b64 v[6:7], v15, v[6:7]
-; GFX8-NEXT: v_or_b32_e32 v8, v8, v10
-; GFX8-NEXT: v_or_b32_e32 v9, v9, v11
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, v12, v25
+; GFX8-NEXT: v_or_b32_e32 v10, v8, v10
+; GFX8-NEXT: v_or_b32_e32 v11, v9, v11
+; GFX8-NEXT: v_lshrrev_b64 v[8:9], v12, v[6:7]
+; GFX8-NEXT: v_lshrrev_b64 v[6:7], v13, v[6:7]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12
; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc
-; GFX8-NEXT: v_or_b32_e32 v0, v21, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v22, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
; GFX8-NEXT: v_or_b32_e32 v4, v16, v4
-; GFX8-NEXT: v_or_b32_e32 v5, v17, v5
-; GFX8-NEXT: v_or_b32_e32 v6, v18, v6
+; GFX8-NEXT: v_or_b32_e32 v5, v18, v5
+; GFX8-NEXT: v_or_b32_e32 v6, v17, v6
; GFX8-NEXT: v_or_b32_e32 v7, v19, v7
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshl_v2i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v16
-; GFX9-NEXT: v_sub_u32_e32 v17, 64, v19
-; GFX9-NEXT: v_add_u32_e32 v25, 0xffffffc0, v19
+; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16
+; GFX9-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9]
+; GFX9-NEXT: v_sub_u32_e32 v17, 64, v23
+; GFX9-NEXT: v_not_b32_e32 v16, v16
; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3]
-; GFX9-NEXT: v_lshlrev_b64 v[23:24], v19, v[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1]
-; GFX9-NEXT: v_or_b32_e32 v17, v17, v21
-; GFX9-NEXT: v_or_b32_e32 v18, v18, v22
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
-; GFX9-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v1, v18, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9]
-; GFX9-NEXT: v_not_b32_e32 v8, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v19, v17, v3, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11]
-; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v8
-; GFX9-NEXT: v_lshl_or_b32 v1, v10, 31, v1
-; GFX9-NEXT: v_sub_u32_e32 v10, 64, v23
-; GFX9-NEXT: v_add_u32_e32 v24, 0xffffffc0, v23
-; GFX9-NEXT: v_lshrrev_b64 v[8:9], v23, v[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3]
-; GFX9-NEXT: v_lshrrev_b64 v[16:17], v23, v[2:3]
-; GFX9-NEXT: v_lshrrev_b64 v[2:3], v24, v[2:3]
-; GFX9-NEXT: v_or_b32_e32 v8, v8, v10
+; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3]
+; GFX9-NEXT: v_lshl_or_b32 v9, v10, 31, v9
+; GFX9-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11]
+; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16
+; GFX9-NEXT: v_sub_u32_e32 v16, 64, v24
+; GFX9-NEXT: v_or_b32_e32 v21, v17, v21
+; GFX9-NEXT: v_or_b32_e32 v22, v18, v22
+; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11]
+; GFX9-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9]
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX9-NEXT: v_or_b32_e32 v18, v18, v16
+; GFX9-NEXT: v_add_u32_e32 v16, 0xffffffc0, v23
+; GFX9-NEXT: v_or_b32_e32 v19, v19, v17
+; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23
-; GFX9-NEXT: v_or_b32_e32 v9, v9, v11
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v25, 0, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5]
+; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v24
+; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5]
+; GFX9-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11]
+; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5]
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX9-NEXT: v_or_b32_e32 v2, v18, v2
-; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc
-; GFX9-NEXT: v_sub_u32_e32 v8, 64, v18
-; GFX9-NEXT: v_or_b32_e32 v3, v19, v3
-; GFX9-NEXT: v_add_u32_e32 v19, 0xffffffc0, v18
+; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, v1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5]
+; GFX9-NEXT: v_or_b32_e32 v1, v18, v3
+; GFX9-NEXT: v_or_b32_e32 v3, v16, v9
+; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v20
+; GFX9-NEXT: v_or_b32_e32 v0, v25, v2
+; GFX9-NEXT: v_or_b32_e32 v2, v17, v8
+; GFX9-NEXT: v_sub_u32_e32 v8, 64, v16
; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5]
-; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7]
-; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5]
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], v19, v[4:5]
-; GFX9-NEXT: v_or_b32_e32 v8, v8, v10
-; GFX9-NEXT: v_or_b32_e32 v9, v9, v11
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18
-; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v5, v9, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7]
+; GFX9-NEXT: v_add_u32_e32 v17, 0xffffffc0, v16
+; GFX9-NEXT: v_or_b32_e32 v10, v8, v10
+; GFX9-NEXT: v_or_b32_e32 v11, v9, v11
+; GFX9-NEXT: v_lshlrev_b64 v[8:9], v16, v[4:5]
+; GFX9-NEXT: v_lshlrev_b64 v[4:5], v17, v[4:5]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
+; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v5, v11, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc
; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13]
-; GFX9-NEXT: v_cndmask_b32_e32 v19, v8, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v8, v7, vcc
; GFX9-NEXT: v_not_b32_e32 v8, v20
-; GFX9-NEXT: v_lshl_or_b32 v5, v14, 31, v5
; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15]
-; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8
-; GFX9-NEXT: v_sub_u32_e32 v10, 64, v14
-; GFX9-NEXT: v_add_u32_e32 v15, 0xffffffc0, v14
-; GFX9-NEXT: v_lshrrev_b64 v[8:9], v14, v[4:5]
+; GFX9-NEXT: v_and_b32_e32 v13, 0x7f, v8
+; GFX9-NEXT: v_lshl_or_b32 v5, v14, 31, v5
+; GFX9-NEXT: v_sub_u32_e32 v10, 64, v13
+; GFX9-NEXT: v_lshrrev_b64 v[8:9], v13, v[4:5]
; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7]
-; GFX9-NEXT: v_lshrrev_b64 v[12:13], v14, v[6:7]
-; GFX9-NEXT: v_lshrrev_b64 v[6:7], v15, v[6:7]
-; GFX9-NEXT: v_or_b32_e32 v8, v8, v10
-; GFX9-NEXT: v_or_b32_e32 v9, v9, v11
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GFX9-NEXT: v_add_u32_e32 v14, 0xffffffc0, v13
+; GFX9-NEXT: v_or_b32_e32 v10, v8, v10
+; GFX9-NEXT: v_or_b32_e32 v11, v9, v11
+; GFX9-NEXT: v_lshrrev_b64 v[8:9], v13, v[6:7]
+; GFX9-NEXT: v_lshrrev_b64 v[6:7], v14, v[6:7]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13
; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc
-; GFX9-NEXT: v_or_b32_e32 v0, v21, v0
-; GFX9-NEXT: v_or_b32_e32 v1, v22, v1
-; GFX9-NEXT: v_or_b32_e32 v4, v16, v4
-; GFX9-NEXT: v_or_b32_e32 v5, v17, v5
-; GFX9-NEXT: v_or_b32_e32 v6, v18, v6
-; GFX9-NEXT: v_or_b32_e32 v7, v19, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
+; GFX9-NEXT: v_or_b32_e32 v4, v17, v4
+; GFX9-NEXT: v_or_b32_e32 v5, v18, v5
+; GFX9-NEXT: v_or_b32_e32 v6, v16, v6
+; GFX9-NEXT: v_or_b32_e32 v7, v12, v7
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fshl_v2i128:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index dbc8f12c2c25c4..36a6614a5620cd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -7719,86 +7719,86 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX6-NEXT: v_not_b32_e32 v0, v16
; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v0
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v19
-; GFX6-NEXT: v_not_b32_e32 v25, 63
; GFX6-NEXT: v_lshr_b64 v[0:1], v[17:18], v0
; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19
-; GFX6-NEXT: v_add_i32_e32 v26, vcc, v19, v25
-; GFX6-NEXT: v_lshl_b64 v[23:24], v[17:18], v19
-; GFX6-NEXT: v_or_b32_e32 v21, v0, v21
-; GFX6-NEXT: v_or_b32_e32 v22, v1, v22
-; GFX6-NEXT: v_lshl_b64 v[0:1], v[17:18], v26
+; GFX6-NEXT: v_and_b32_e32 v25, 0x7f, v16
+; GFX6-NEXT: v_or_b32_e32 v23, v0, v21
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v25
+; GFX6-NEXT: v_or_b32_e32 v24, v1, v22
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[10:11], v0
+; GFX6-NEXT: v_lshr_b64 v[21:22], v[8:9], v25
+; GFX6-NEXT: v_not_b32_e32 v26, 63
+; GFX6-NEXT: v_or_b32_e32 v21, v21, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v19, v26
+; GFX6-NEXT: v_or_b32_e32 v22, v22, v1
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[17:18], v0
; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
-; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
-; GFX6-NEXT: v_and_b32_e32 v22, 0x7f, v16
-; GFX6-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v22
-; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v22
-; GFX6-NEXT: v_lshl_b64 v[2:3], v[10:11], v2
-; GFX6-NEXT: v_add_i32_e32 v24, vcc, v22, v25
-; GFX6-NEXT: v_or_b32_e32 v2, v0, v2
-; GFX6-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v24
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
+; GFX6-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5]
+; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v25, v26
+; GFX6-NEXT: v_lshl_b64 v[16:17], v[17:18], v19
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v0
+; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25
+; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v25
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25
+; GFX6-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc
; GFX6-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v18, 0, v1, s[4:5]
+; GFX6-NEXT: v_or_b32_e32 v0, v16, v8
+; GFX6-NEXT: v_or_b32_e32 v1, v17, v9
; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], 1
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5
-; GFX6-NEXT: v_lshr_b64 v[16:17], v[10:11], v22
; GFX6-NEXT: v_or_b32_e32 v6, v6, v4
; GFX6-NEXT: v_not_b32_e32 v4, v20
-; GFX6-NEXT: v_or_b32_e32 v0, v18, v0
-; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v4
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v18
+; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v4
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v16
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v10
; GFX6-NEXT: v_lshr_b64 v[4:5], v[8:9], v4
-; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v18
-; GFX6-NEXT: v_or_b32_e32 v2, v19, v2
-; GFX6-NEXT: v_add_i32_e32 v19, vcc, v18, v25
-; GFX6-NEXT: v_lshl_b64 v[16:17], v[8:9], v18
+; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v16
+; GFX6-NEXT: v_add_i32_e32 v17, vcc, v16, v26
; GFX6-NEXT: v_or_b32_e32 v10, v4, v10
; GFX6-NEXT: v_or_b32_e32 v11, v5, v11
-; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v19
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18
-; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18
-; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20
-; GFX6-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc
-; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v18
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], v18
+; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v16
+; GFX6-NEXT: v_lshl_b64 v[8:9], v[8:9], v17
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v18
+; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v4, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
+; GFX6-NEXT: v_and_b32_e32 v10, 0x7f, v20
+; GFX6-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v10
+; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], v10
; GFX6-NEXT: v_lshl_b64 v[6:7], v[14:15], v6
-; GFX6-NEXT: v_add_i32_e32 v19, vcc, v18, v25
-; GFX6-NEXT: v_or_b32_e32 v6, v4, v6
-; GFX6-NEXT: v_or_b32_e32 v7, v5, v7
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], v19
-; GFX6-NEXT: v_lshr_b64 v[8:9], v[14:15], v18
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18
-; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
-; GFX6-NEXT: v_or_b32_e32 v1, v23, v1
-; GFX6-NEXT: v_or_b32_e32 v3, v21, v3
-; GFX6-NEXT: v_or_b32_e32 v4, v16, v4
-; GFX6-NEXT: v_or_b32_e32 v5, v17, v5
-; GFX6-NEXT: v_or_b32_e32 v6, v10, v6
-; GFX6-NEXT: v_or_b32_e32 v7, v11, v7
+; GFX6-NEXT: v_add_i32_e32 v11, vcc, v10, v26
+; GFX6-NEXT: v_or_b32_e32 v16, v4, v6
+; GFX6-NEXT: v_or_b32_e32 v19, v5, v7
+; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], v11
+; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], v10
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
+; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v19, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10
+; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc
+; GFX6-NEXT: v_or_b32_e32 v4, v17, v6
+; GFX6-NEXT: v_or_b32_e32 v5, v18, v7
+; GFX6-NEXT: v_or_b32_e32 v6, v8, v10
+; GFX6-NEXT: v_or_b32_e32 v7, v9, v11
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshr_v2i128:
@@ -7811,86 +7811,86 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX8-NEXT: v_not_b32_e32 v0, v16
; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v0
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v19
-; GFX8-NEXT: v_not_b32_e32 v25, 63
; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18]
; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3]
-; GFX8-NEXT: v_add_u32_e32 v26, vcc, v19, v25
-; GFX8-NEXT: v_lshlrev_b64 v[23:24], v19, v[17:18]
-; GFX8-NEXT: v_or_b32_e32 v21, v0, v21
-; GFX8-NEXT: v_or_b32_e32 v22, v1, v22
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v26, v[17:18]
+; GFX8-NEXT: v_and_b32_e32 v25, 0x7f, v16
+; GFX8-NEXT: v_or_b32_e32 v23, v0, v21
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v25
+; GFX8-NEXT: v_or_b32_e32 v24, v1, v22
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, v[10:11]
+; GFX8-NEXT: v_lshrrev_b64 v[21:22], v25, v[8:9]
+; GFX8-NEXT: v_not_b32_e32 v26, 63
+; GFX8-NEXT: v_or_b32_e32 v21, v21, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v19, v26
+; GFX8-NEXT: v_or_b32_e32 v22, v22, v1
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, v[17:18]
; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
-; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
-; GFX8-NEXT: v_and_b32_e32 v22, 0x7f, v16
-; GFX8-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v22
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v22, v[8:9]
-; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11]
-; GFX8-NEXT: v_add_u32_e32 v24, vcc, v22, v25
-; GFX8-NEXT: v_or_b32_e32 v2, v0, v2
-; GFX8-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5]
+; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v25, v26
+; GFX8-NEXT: v_lshlrev_b64 v[16:17], v19, v[17:18]
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11]
+; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25
+; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11]
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc
; GFX8-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, v1, s[4:5]
+; GFX8-NEXT: v_or_b32_e32 v0, v16, v8
+; GFX8-NEXT: v_or_b32_e32 v1, v17, v9
; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5
-; GFX8-NEXT: v_lshrrev_b64 v[16:17], v22, v[10:11]
; GFX8-NEXT: v_or_b32_e32 v6, v6, v4
; GFX8-NEXT: v_not_b32_e32 v4, v20
-; GFX8-NEXT: v_or_b32_e32 v0, v18, v0
-; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc
-; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v18
+; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v4
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v16
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v10
; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9]
-; GFX8-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7]
-; GFX8-NEXT: v_or_b32_e32 v2, v19, v2
-; GFX8-NEXT: v_add_u32_e32 v19, vcc, v18, v25
-; GFX8-NEXT: v_lshlrev_b64 v[16:17], v18, v[8:9]
+; GFX8-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7]
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, v16, v26
; GFX8-NEXT: v_or_b32_e32 v10, v4, v10
; GFX8-NEXT: v_or_b32_e32 v11, v5, v11
-; GFX8-NEXT: v_lshlrev_b64 v[4:5], v19, v[8:9]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18
-; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18
-; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20
-; GFX8-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc
-; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v18
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v18, v[12:13]
+; GFX8-NEXT: v_lshlrev_b64 v[4:5], v16, v[8:9]
+; GFX8-NEXT: v_lshlrev_b64 v[8:9], v17, v[8:9]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
+; GFX8-NEXT: v_and_b32_e32 v10, 0x7f, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc
+; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v10
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, v[12:13]
; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[14:15]
-; GFX8-NEXT: v_add_u32_e32 v19, vcc, v18, v25
-; GFX8-NEXT: v_or_b32_e32 v6, v4, v6
-; GFX8-NEXT: v_or_b32_e32 v7, v5, v7
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v19, v[14:15]
-; GFX8-NEXT: v_lshrrev_b64 v[8:9], v18, v[14:15]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
-; GFX8-NEXT: v_or_b32_e32 v1, v23, v1
-; GFX8-NEXT: v_or_b32_e32 v3, v21, v3
-; GFX8-NEXT: v_or_b32_e32 v4, v16, v4
-; GFX8-NEXT: v_or_b32_e32 v5, v17, v5
-; GFX8-NEXT: v_or_b32_e32 v6, v10, v6
-; GFX8-NEXT: v_or_b32_e32 v7, v11, v7
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, v10, v26
+; GFX8-NEXT: v_or_b32_e32 v16, v4, v6
+; GFX8-NEXT: v_or_b32_e32 v19, v5, v7
+; GFX8-NEXT: v_lshrrev_b64 v[6:7], v11, v[14:15]
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, v[14:15]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v19, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc
+; GFX8-NEXT: v_or_b32_e32 v4, v17, v6
+; GFX8-NEXT: v_or_b32_e32 v5, v18, v7
+; GFX8-NEXT: v_or_b32_e32 v6, v8, v10
+; GFX8-NEXT: v_or_b32_e32 v7, v9, v11
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v2i128:
@@ -7905,83 +7905,83 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX9-NEXT: v_sub_u32_e32 v0, 64, v19
; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18]
; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3]
-; GFX9-NEXT: v_add_u32_e32 v25, 0xffffffc0, v19
-; GFX9-NEXT: v_lshlrev_b64 v[23:24], v19, v[17:18]
-; GFX9-NEXT: v_or_b32_e32 v21, v0, v21
-; GFX9-NEXT: v_or_b32_e32 v22, v1, v22
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[17:18]
+; GFX9-NEXT: v_and_b32_e32 v25, 0x7f, v16
+; GFX9-NEXT: v_or_b32_e32 v23, v0, v21
+; GFX9-NEXT: v_sub_u32_e32 v0, 64, v25
+; GFX9-NEXT: v_or_b32_e32 v24, v1, v22
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[10:11]
+; GFX9-NEXT: v_lshrrev_b64 v[21:22], v25, v[8:9]
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
-; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
-; GFX9-NEXT: v_and_b32_e32 v22, 0x7f, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc
-; GFX9-NEXT: v_sub_u32_e32 v2, 64, v22
-; GFX9-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], v22, v[8:9]
-; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11]
-; GFX9-NEXT: v_add_u32_e32 v24, 0xffffffc0, v22
-; GFX9-NEXT: v_or_b32_e32 v2, v0, v2
-; GFX9-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22
+; GFX9-NEXT: v_or_b32_e32 v21, v21, v0
+; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v19
+; GFX9-NEXT: v_or_b32_e32 v22, v22, v1
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[17:18]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5]
+; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v25
+; GFX9-NEXT: v_lshlrev_b64 v[16:17], v19, v[17:18]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5]
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11]
+; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25
+; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11]
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc
; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5]
+; GFX9-NEXT: v_or_b32_e32 v0, v16, v8
+; GFX9-NEXT: v_or_b32_e32 v1, v17, v9
; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5]
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5
; GFX9-NEXT: v_or_b32_e32 v6, v6, v4
; GFX9-NEXT: v_not_b32_e32 v4, v20
-; GFX9-NEXT: v_lshrrev_b64 v[16:17], v22, v[10:11]
-; GFX9-NEXT: v_or_b32_e32 v0, v18, v0
-; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v4
-; GFX9-NEXT: v_sub_u32_e32 v4, 64, v18
-; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc
+; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v4
+; GFX9-NEXT: v_sub_u32_e32 v4, 64, v16
+; GFX9-NEXT: v_or_b32_e32 v2, v2, v10
+; GFX9-NEXT: v_or_b32_e32 v3, v3, v11
; GFX9-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9]
-; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7]
-; GFX9-NEXT: v_or_b32_e32 v2, v19, v2
-; GFX9-NEXT: v_add_u32_e32 v19, 0xffffffc0, v18
-; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc
-; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[8:9]
+; GFX9-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7]
+; GFX9-NEXT: v_add_u32_e32 v17, 0xffffffc0, v16
; GFX9-NEXT: v_or_b32_e32 v10, v4, v10
; GFX9-NEXT: v_or_b32_e32 v11, v5, v11
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], v19, v[8:9]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18
-; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18
-; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc
-; GFX9-NEXT: v_sub_u32_e32 v6, 64, v18
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[4:5], v18, v[12:13]
+; GFX9-NEXT: v_lshlrev_b64 v[4:5], v16, v[8:9]
+; GFX9-NEXT: v_lshlrev_b64 v[8:9], v17, v[8:9]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
+; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
+; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v20
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc
+; GFX9-NEXT: v_sub_u32_e32 v6, 64, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, v[12:13]
; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[14:15]
-; GFX9-NEXT: v_add_u32_e32 v19, 0xffffffc0, v18
-; GFX9-NEXT: v_or_b32_e32 v6, v4, v6
-; GFX9-NEXT: v_or_b32_e32 v7, v5, v7
-; GFX9-NEXT: v_lshrrev_b64 v[4:5], v19, v[14:15]
-; GFX9-NEXT: v_lshrrev_b64 v[8:9], v18, v[14:15]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
-; GFX9-NEXT: v_or_b32_e32 v1, v23, v1
-; GFX9-NEXT: v_or_b32_e32 v3, v21, v3
-; GFX9-NEXT: v_or_b32_e32 v4, v16, v4
-; GFX9-NEXT: v_or_b32_e32 v5, v17, v5
-; GFX9-NEXT: v_or_b32_e32 v6, v10, v6
-; GFX9-NEXT: v_or_b32_e32 v7, v11, v7
+; GFX9-NEXT: v_add_u32_e32 v11, 0xffffffc0, v10
+; GFX9-NEXT: v_or_b32_e32 v16, v4, v6
+; GFX9-NEXT: v_or_b32_e32 v19, v5, v7
+; GFX9-NEXT: v_lshrrev_b64 v[6:7], v11, v[14:15]
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, v[14:15]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v19, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc
+; GFX9-NEXT: v_or_b32_e32 v4, v17, v6
+; GFX9-NEXT: v_or_b32_e32 v5, v18, v7
+; GFX9-NEXT: v_or_b32_e32 v6, v8, v10
+; GFX9-NEXT: v_or_b32_e32 v7, v9, v11
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fshr_v2i128:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index df1afdf77983cc..298dfcf048fc46 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -715,27 +715,27 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; GPRIDX-NEXT: v_mov_b32_e32 v16, s17
; GPRIDX-NEXT: v_mov_b32_e32 v17, s18
; GPRIDX-NEXT: v_mov_b32_e32 v18, s19
-; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2
-; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
-; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v0, s[4:5]
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 2, v2
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 3, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 2, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 3, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 4, v2
; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 5, v2
; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 6, v2
; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[14:15], 7, v2
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[16:17], 4, v2
-; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
-; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v1, s[4:5]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[6:7]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[8:9]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v0, s[16:17]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[16:17]
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v1, s[16:17]
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[4:5]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[6:7]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v0, s[8:9]
; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v0, s[10:11]
; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v0, s[12:13]
; GPRIDX-NEXT: v_cndmask_b32_e64 v17, v17, v0, s[14:15]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[6:7]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[8:9]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v1, s[16:17]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[4:5]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[6:7]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v1, s[8:9]
; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v1, s[10:11]
; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v1, s[12:13]
; GPRIDX-NEXT: v_cndmask_b32_e64 v18, v18, v1, s[14:15]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
index 75d4d8816fb30d..e8de761540b7a2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
@@ -14,167 +14,168 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src)
; LOOP-NEXT: v_mov_b32_e32 v4, s0
; LOOP-NEXT: .LBB0_1: ; %load-store-loop
; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1
+; LOOP-NEXT: v_add_i32_e32 v6, vcc, v2, v4
+; LOOP-NEXT: v_addc_u32_e32 v7, vcc, v3, v5, vcc
+; LOOP-NEXT: buffer_load_ubyte v26, v[6:7], s[0:3], 0 addr64
+; LOOP-NEXT: s_waitcnt expcnt(5)
+; LOOP-NEXT: buffer_load_ubyte v29, v[6:7], s[0:3], 0 addr64 offset:1
; LOOP-NEXT: s_waitcnt expcnt(2)
-; LOOP-NEXT: v_add_i32_e32 v29, vcc, v2, v4
-; LOOP-NEXT: v_addc_u32_e32 v30, vcc, v3, v5, vcc
-; LOOP-NEXT: buffer_load_ubyte v24, v[29:30], s[0:3], 0 addr64
-; LOOP-NEXT: buffer_load_ubyte v27, v[29:30], s[0:3], 0 addr64 offset:1
-; LOOP-NEXT: buffer_load_ubyte v34, v[29:30], s[0:3], 0 addr64 offset:2
-; LOOP-NEXT: buffer_load_ubyte v35, v[29:30], s[0:3], 0 addr64 offset:3
-; LOOP-NEXT: buffer_load_ubyte v36, v[29:30], s[0:3], 0 addr64 offset:4
-; LOOP-NEXT: buffer_load_ubyte v37, v[29:30], s[0:3], 0 addr64 offset:5
-; LOOP-NEXT: buffer_load_ubyte v38, v[29:30], s[0:3], 0 addr64 offset:6
-; LOOP-NEXT: buffer_load_ubyte v39, v[29:30], s[0:3], 0 addr64 offset:7
-; LOOP-NEXT: buffer_load_ubyte v6, v[29:30], s[0:3], 0 addr64 offset:8
-; LOOP-NEXT: buffer_load_ubyte v9, v[29:30], s[0:3], 0 addr64 offset:9
-; LOOP-NEXT: buffer_load_ubyte v10, v[29:30], s[0:3], 0 addr64 offset:10
+; LOOP-NEXT: buffer_load_ubyte v31, v[6:7], s[0:3], 0 addr64 offset:2
+; LOOP-NEXT: buffer_load_ubyte v32, v[6:7], s[0:3], 0 addr64 offset:3
+; LOOP-NEXT: buffer_load_ubyte v36, v[6:7], s[0:3], 0 addr64 offset:4
+; LOOP-NEXT: buffer_load_ubyte v37, v[6:7], s[0:3], 0 addr64 offset:5
+; LOOP-NEXT: buffer_load_ubyte v38, v[6:7], s[0:3], 0 addr64 offset:6
+; LOOP-NEXT: buffer_load_ubyte v39, v[6:7], s[0:3], 0 addr64 offset:7
+; LOOP-NEXT: buffer_load_ubyte v8, v[6:7], s[0:3], 0 addr64 offset:8
+; LOOP-NEXT: buffer_load_ubyte v11, v[6:7], s[0:3], 0 addr64 offset:9
+; LOOP-NEXT: buffer_load_ubyte v12, v[6:7], s[0:3], 0 addr64 offset:10
; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: buffer_load_ubyte v11, v[29:30], s[0:3], 0 addr64 offset:11
-; LOOP-NEXT: buffer_load_ubyte v7, v[29:30], s[0:3], 0 addr64 offset:12
-; LOOP-NEXT: buffer_load_ubyte v13, v[29:30], s[0:3], 0 addr64 offset:13
-; LOOP-NEXT: buffer_load_ubyte v14, v[29:30], s[0:3], 0 addr64 offset:14
-; LOOP-NEXT: buffer_load_ubyte v15, v[29:30], s[0:3], 0 addr64 offset:15
-; LOOP-NEXT: buffer_load_ubyte v8, v[29:30], s[0:3], 0 addr64 offset:16
-; LOOP-NEXT: buffer_load_ubyte v17, v[29:30], s[0:3], 0 addr64 offset:17
-; LOOP-NEXT: buffer_load_ubyte v18, v[29:30], s[0:3], 0 addr64 offset:18
-; LOOP-NEXT: buffer_load_ubyte v19, v[29:30], s[0:3], 0 addr64 offset:19
-; LOOP-NEXT: buffer_load_ubyte v12, v[29:30], s[0:3], 0 addr64 offset:20
-; LOOP-NEXT: buffer_load_ubyte v21, v[29:30], s[0:3], 0 addr64 offset:21
-; LOOP-NEXT: buffer_load_ubyte v22, v[29:30], s[0:3], 0 addr64 offset:22
-; LOOP-NEXT: buffer_load_ubyte v23, v[29:30], s[0:3], 0 addr64 offset:23
-; LOOP-NEXT: buffer_load_ubyte v16, v[29:30], s[0:3], 0 addr64 offset:24
-; LOOP-NEXT: buffer_load_ubyte v25, v[29:30], s[0:3], 0 addr64 offset:25
-; LOOP-NEXT: buffer_load_ubyte v26, v[29:30], s[0:3], 0 addr64 offset:26
-; LOOP-NEXT: buffer_load_ubyte v28, v[29:30], s[0:3], 0 addr64 offset:27
-; LOOP-NEXT: buffer_load_ubyte v20, v[29:30], s[0:3], 0 addr64 offset:28
-; LOOP-NEXT: buffer_load_ubyte v31, v[29:30], s[0:3], 0 addr64 offset:29
-; LOOP-NEXT: buffer_load_ubyte v32, v[29:30], s[0:3], 0 addr64 offset:30
-; LOOP-NEXT: buffer_load_ubyte v33, v[29:30], s[0:3], 0 addr64 offset:31
+; LOOP-NEXT: buffer_load_ubyte v13, v[6:7], s[0:3], 0 addr64 offset:11
+; LOOP-NEXT: buffer_load_ubyte v9, v[6:7], s[0:3], 0 addr64 offset:12
+; LOOP-NEXT: buffer_load_ubyte v15, v[6:7], s[0:3], 0 addr64 offset:13
+; LOOP-NEXT: buffer_load_ubyte v16, v[6:7], s[0:3], 0 addr64 offset:14
+; LOOP-NEXT: buffer_load_ubyte v17, v[6:7], s[0:3], 0 addr64 offset:15
+; LOOP-NEXT: buffer_load_ubyte v10, v[6:7], s[0:3], 0 addr64 offset:16
+; LOOP-NEXT: buffer_load_ubyte v19, v[6:7], s[0:3], 0 addr64 offset:17
+; LOOP-NEXT: buffer_load_ubyte v20, v[6:7], s[0:3], 0 addr64 offset:18
+; LOOP-NEXT: buffer_load_ubyte v21, v[6:7], s[0:3], 0 addr64 offset:19
+; LOOP-NEXT: buffer_load_ubyte v14, v[6:7], s[0:3], 0 addr64 offset:20
+; LOOP-NEXT: buffer_load_ubyte v23, v[6:7], s[0:3], 0 addr64 offset:21
+; LOOP-NEXT: buffer_load_ubyte v24, v[6:7], s[0:3], 0 addr64 offset:22
+; LOOP-NEXT: buffer_load_ubyte v25, v[6:7], s[0:3], 0 addr64 offset:23
+; LOOP-NEXT: buffer_load_ubyte v18, v[6:7], s[0:3], 0 addr64 offset:24
+; LOOP-NEXT: buffer_load_ubyte v27, v[6:7], s[0:3], 0 addr64 offset:25
+; LOOP-NEXT: buffer_load_ubyte v28, v[6:7], s[0:3], 0 addr64 offset:26
+; LOOP-NEXT: buffer_load_ubyte v30, v[6:7], s[0:3], 0 addr64 offset:27
+; LOOP-NEXT: buffer_load_ubyte v22, v[6:7], s[0:3], 0 addr64 offset:28
+; LOOP-NEXT: buffer_load_ubyte v33, v[6:7], s[0:3], 0 addr64 offset:29
+; LOOP-NEXT: buffer_load_ubyte v34, v[6:7], s[0:3], 0 addr64 offset:30
+; LOOP-NEXT: buffer_load_ubyte v35, v[6:7], s[0:3], 0 addr64 offset:31
; LOOP-NEXT: s_waitcnt vmcnt(14)
-; LOOP-NEXT: v_lshlrev_b32_e32 v27, 8, v27
-; LOOP-NEXT: v_or_b32_e32 v24, v27, v24
-; LOOP-NEXT: v_lshlrev_b32_e32 v27, 24, v35
-; LOOP-NEXT: v_lshlrev_b32_e32 v29, 16, v34
-; LOOP-NEXT: v_or_b32_e32 v27, v27, v29
-; LOOP-NEXT: v_lshlrev_b32_e32 v29, 8, v37
-; LOOP-NEXT: v_lshlrev_b32_e32 v30, 24, v39
-; LOOP-NEXT: v_lshlrev_b32_e32 v34, 16, v38
-; LOOP-NEXT: v_or_b32_e32 v29, v29, v36
-; LOOP-NEXT: v_or_b32_e32 v30, v30, v34
-; LOOP-NEXT: v_add_i32_e32 v34, vcc, v0, v4
-; LOOP-NEXT: v_addc_u32_e32 v35, vcc, v1, v5, vcc
+; LOOP-NEXT: v_lshlrev_b32_e32 v6, 8, v29
+; LOOP-NEXT: v_or_b32_e32 v26, v6, v26
+; LOOP-NEXT: v_lshlrev_b32_e32 v6, 24, v32
+; LOOP-NEXT: v_lshlrev_b32_e32 v7, 16, v31
+; LOOP-NEXT: v_or_b32_e32 v29, v6, v7
+; LOOP-NEXT: v_lshlrev_b32_e32 v6, 8, v37
+; LOOP-NEXT: v_lshlrev_b32_e32 v7, 24, v39
+; LOOP-NEXT: v_lshlrev_b32_e32 v32, 16, v38
+; LOOP-NEXT: v_or_b32_e32 v31, v6, v36
+; LOOP-NEXT: v_or_b32_e32 v32, v7, v32
+; LOOP-NEXT: v_add_i32_e32 v6, vcc, v0, v4
+; LOOP-NEXT: v_addc_u32_e32 v7, vcc, v1, v5, vcc
; LOOP-NEXT: v_add_i32_e32 v4, vcc, 32, v4
; LOOP-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; LOOP-NEXT: v_cmp_gt_u32_e32 vcc, 32, v4
-; LOOP-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; LOOP-NEXT: v_lshlrev_b32_e32 v11, 24, v11
-; LOOP-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; LOOP-NEXT: v_lshlrev_b32_e32 v13, 8, v13
-; LOOP-NEXT: v_lshlrev_b32_e32 v15, 24, v15
-; LOOP-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; LOOP-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; LOOP-NEXT: v_lshlrev_b32_e32 v11, 8, v11
+; LOOP-NEXT: v_lshlrev_b32_e32 v13, 24, v13
+; LOOP-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; LOOP-NEXT: v_lshlrev_b32_e32 v15, 8, v15
+; LOOP-NEXT: v_lshlrev_b32_e32 v17, 24, v17
+; LOOP-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; LOOP-NEXT: v_lshlrev_b32_e32 v19, 8, v19
; LOOP-NEXT: s_waitcnt vmcnt(12)
-; LOOP-NEXT: v_lshlrev_b32_e32 v19, 24, v19
-; LOOP-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; LOOP-NEXT: v_lshlrev_b32_e32 v21, 24, v21
+; LOOP-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; LOOP-NEXT: s_waitcnt vmcnt(10)
-; LOOP-NEXT: v_lshlrev_b32_e32 v21, 8, v21
+; LOOP-NEXT: v_lshlrev_b32_e32 v23, 8, v23
; LOOP-NEXT: s_waitcnt vmcnt(8)
-; LOOP-NEXT: v_lshlrev_b32_e32 v23, 24, v23
-; LOOP-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; LOOP-NEXT: v_lshlrev_b32_e32 v25, 24, v25
+; LOOP-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; LOOP-NEXT: s_waitcnt vmcnt(6)
-; LOOP-NEXT: v_lshlrev_b32_e32 v25, 8, v25
+; LOOP-NEXT: v_lshlrev_b32_e32 v27, 8, v27
; LOOP-NEXT: s_waitcnt vmcnt(4)
-; LOOP-NEXT: v_lshlrev_b32_e32 v28, 24, v28
-; LOOP-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; LOOP-NEXT: v_lshlrev_b32_e32 v30, 24, v30
+; LOOP-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; LOOP-NEXT: s_waitcnt vmcnt(2)
-; LOOP-NEXT: v_lshlrev_b32_e32 v31, 8, v31
+; LOOP-NEXT: v_lshlrev_b32_e32 v33, 8, v33
; LOOP-NEXT: s_waitcnt vmcnt(0)
-; LOOP-NEXT: v_lshlrev_b32_e32 v33, 24, v33
-; LOOP-NEXT: v_lshlrev_b32_e32 v32, 16, v32
-; LOOP-NEXT: v_or_b32_e32 v6, v9, v6
-; LOOP-NEXT: v_or_b32_e32 v9, v11, v10
-; LOOP-NEXT: v_or_b32_e32 v7, v13, v7
-; LOOP-NEXT: v_or_b32_e32 v10, v15, v14
-; LOOP-NEXT: v_or_b32_e32 v8, v17, v8
-; LOOP-NEXT: v_or_b32_e32 v11, v19, v18
-; LOOP-NEXT: v_or_b32_e32 v12, v21, v12
-; LOOP-NEXT: v_or_b32_e32 v13, v23, v22
-; LOOP-NEXT: v_or_b32_e32 v14, v25, v16
-; LOOP-NEXT: v_or_b32_e32 v15, v28, v26
-; LOOP-NEXT: v_or_b32_e32 v16, v31, v20
-; LOOP-NEXT: v_or_b32_e32 v17, v33, v32
-; LOOP-NEXT: v_or_b32_e32 v18, v27, v24
-; LOOP-NEXT: v_or_b32_e32 v19, v30, v29
-; LOOP-NEXT: v_or_b32_e32 v6, v9, v6
-; LOOP-NEXT: v_or_b32_e32 v7, v10, v7
+; LOOP-NEXT: v_lshlrev_b32_e32 v35, 24, v35
+; LOOP-NEXT: v_lshlrev_b32_e32 v34, 16, v34
; LOOP-NEXT: v_or_b32_e32 v8, v11, v8
-; LOOP-NEXT: v_or_b32_e32 v9, v13, v12
-; LOOP-NEXT: v_or_b32_e32 v10, v15, v14
-; LOOP-NEXT: v_or_b32_e32 v11, v17, v16
-; LOOP-NEXT: v_lshrrev_b32_e32 v12, 16, v18
-; LOOP-NEXT: v_bfe_u32 v13, v18, 8, 8
-; LOOP-NEXT: buffer_store_byte v18, v[34:35], s[0:3], 0 addr64
-; LOOP-NEXT: v_lshrrev_b32_e32 v14, 24, v18
-; LOOP-NEXT: v_lshrrev_b32_e32 v15, 16, v19
-; LOOP-NEXT: v_bfe_u32 v16, v19, 8, 8
-; LOOP-NEXT: buffer_store_byte v19, v[34:35], s[0:3], 0 addr64 offset:4
-; LOOP-NEXT: v_lshrrev_b32_e32 v17, 24, v19
+; LOOP-NEXT: v_or_b32_e32 v11, v13, v12
+; LOOP-NEXT: v_or_b32_e32 v9, v15, v9
+; LOOP-NEXT: v_or_b32_e32 v12, v17, v16
+; LOOP-NEXT: v_or_b32_e32 v10, v19, v10
+; LOOP-NEXT: v_or_b32_e32 v13, v21, v20
+; LOOP-NEXT: v_or_b32_e32 v14, v23, v14
+; LOOP-NEXT: v_or_b32_e32 v15, v25, v24
+; LOOP-NEXT: v_or_b32_e32 v16, v27, v18
+; LOOP-NEXT: v_or_b32_e32 v17, v30, v28
+; LOOP-NEXT: v_or_b32_e32 v18, v33, v22
+; LOOP-NEXT: v_or_b32_e32 v19, v35, v34
+; LOOP-NEXT: v_or_b32_e32 v20, v29, v26
+; LOOP-NEXT: v_or_b32_e32 v21, v32, v31
+; LOOP-NEXT: v_or_b32_e32 v8, v11, v8
+; LOOP-NEXT: v_or_b32_e32 v9, v12, v9
+; LOOP-NEXT: v_or_b32_e32 v10, v13, v10
+; LOOP-NEXT: v_or_b32_e32 v11, v15, v14
+; LOOP-NEXT: v_or_b32_e32 v12, v17, v16
+; LOOP-NEXT: v_or_b32_e32 v13, v19, v18
+; LOOP-NEXT: v_lshrrev_b32_e32 v14, 16, v20
+; LOOP-NEXT: v_bfe_u32 v15, v20, 8, 8
+; LOOP-NEXT: buffer_store_byte v20, v[6:7], s[0:3], 0 addr64
+; LOOP-NEXT: v_lshrrev_b32_e32 v16, 24, v20
+; LOOP-NEXT: v_lshrrev_b32_e32 v17, 16, v21
+; LOOP-NEXT: v_bfe_u32 v18, v21, 8, 8
+; LOOP-NEXT: buffer_store_byte v21, v[6:7], s[0:3], 0 addr64 offset:4
+; LOOP-NEXT: v_lshrrev_b32_e32 v19, 24, v21
; LOOP-NEXT: s_waitcnt expcnt(1)
-; LOOP-NEXT: v_lshrrev_b32_e32 v18, 16, v6
-; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: v_bfe_u32 v19, v6, 8, 8
-; LOOP-NEXT: buffer_store_byte v6, v[34:35], s[0:3], 0 addr64 offset:8
+; LOOP-NEXT: v_lshrrev_b32_e32 v20, 16, v8
; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: v_lshrrev_b32_e32 v6, 24, v6
-; LOOP-NEXT: v_lshrrev_b32_e32 v20, 16, v7
-; LOOP-NEXT: v_bfe_u32 v21, v7, 8, 8
-; LOOP-NEXT: buffer_store_byte v7, v[34:35], s[0:3], 0 addr64 offset:12
-; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: v_lshrrev_b32_e32 v7, 24, v7
-; LOOP-NEXT: v_lshrrev_b32_e32 v22, 16, v8
-; LOOP-NEXT: v_bfe_u32 v23, v8, 8, 8
-; LOOP-NEXT: buffer_store_byte v8, v[34:35], s[0:3], 0 addr64 offset:16
+; LOOP-NEXT: v_bfe_u32 v21, v8, 8, 8
+; LOOP-NEXT: buffer_store_byte v8, v[6:7], s[0:3], 0 addr64 offset:8
; LOOP-NEXT: s_waitcnt expcnt(0)
; LOOP-NEXT: v_lshrrev_b32_e32 v8, 24, v8
-; LOOP-NEXT: v_lshrrev_b32_e32 v24, 16, v9
-; LOOP-NEXT: v_bfe_u32 v25, v9, 8, 8
-; LOOP-NEXT: buffer_store_byte v9, v[34:35], s[0:3], 0 addr64 offset:20
+; LOOP-NEXT: v_lshrrev_b32_e32 v22, 16, v9
+; LOOP-NEXT: v_bfe_u32 v23, v9, 8, 8
+; LOOP-NEXT: buffer_store_byte v9, v[6:7], s[0:3], 0 addr64 offset:12
; LOOP-NEXT: s_waitcnt expcnt(0)
; LOOP-NEXT: v_lshrrev_b32_e32 v9, 24, v9
-; LOOP-NEXT: v_lshrrev_b32_e32 v26, 16, v10
-; LOOP-NEXT: v_bfe_u32 v27, v10, 8, 8
-; LOOP-NEXT: buffer_store_byte v10, v[34:35], s[0:3], 0 addr64 offset:24
+; LOOP-NEXT: v_lshrrev_b32_e32 v24, 16, v10
+; LOOP-NEXT: v_bfe_u32 v25, v10, 8, 8
+; LOOP-NEXT: buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:16
; LOOP-NEXT: s_waitcnt expcnt(0)
; LOOP-NEXT: v_lshrrev_b32_e32 v10, 24, v10
-; LOOP-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; LOOP-NEXT: v_bfe_u32 v29, v11, 8, 8
-; LOOP-NEXT: buffer_store_byte v11, v[34:35], s[0:3], 0 addr64 offset:28
+; LOOP-NEXT: v_lshrrev_b32_e32 v26, 16, v11
+; LOOP-NEXT: v_bfe_u32 v27, v11, 8, 8
+; LOOP-NEXT: buffer_store_byte v11, v[6:7], s[0:3], 0 addr64 offset:20
; LOOP-NEXT: s_waitcnt expcnt(0)
; LOOP-NEXT: v_lshrrev_b32_e32 v11, 24, v11
-; LOOP-NEXT: buffer_store_byte v13, v[34:35], s[0:3], 0 addr64 offset:1
-; LOOP-NEXT: buffer_store_byte v12, v[34:35], s[0:3], 0 addr64 offset:2
-; LOOP-NEXT: buffer_store_byte v14, v[34:35], s[0:3], 0 addr64 offset:3
-; LOOP-NEXT: buffer_store_byte v16, v[34:35], s[0:3], 0 addr64 offset:5
-; LOOP-NEXT: buffer_store_byte v15, v[34:35], s[0:3], 0 addr64 offset:6
-; LOOP-NEXT: buffer_store_byte v17, v[34:35], s[0:3], 0 addr64 offset:7
-; LOOP-NEXT: buffer_store_byte v19, v[34:35], s[0:3], 0 addr64 offset:9
-; LOOP-NEXT: buffer_store_byte v18, v[34:35], s[0:3], 0 addr64 offset:10
-; LOOP-NEXT: buffer_store_byte v6, v[34:35], s[0:3], 0 addr64 offset:11
-; LOOP-NEXT: buffer_store_byte v21, v[34:35], s[0:3], 0 addr64 offset:13
-; LOOP-NEXT: buffer_store_byte v20, v[34:35], s[0:3], 0 addr64 offset:14
-; LOOP-NEXT: buffer_store_byte v7, v[34:35], s[0:3], 0 addr64 offset:15
-; LOOP-NEXT: buffer_store_byte v23, v[34:35], s[0:3], 0 addr64 offset:17
-; LOOP-NEXT: buffer_store_byte v22, v[34:35], s[0:3], 0 addr64 offset:18
-; LOOP-NEXT: buffer_store_byte v8, v[34:35], s[0:3], 0 addr64 offset:19
-; LOOP-NEXT: buffer_store_byte v25, v[34:35], s[0:3], 0 addr64 offset:21
-; LOOP-NEXT: buffer_store_byte v24, v[34:35], s[0:3], 0 addr64 offset:22
-; LOOP-NEXT: buffer_store_byte v9, v[34:35], s[0:3], 0 addr64 offset:23
-; LOOP-NEXT: buffer_store_byte v27, v[34:35], s[0:3], 0 addr64 offset:25
-; LOOP-NEXT: buffer_store_byte v26, v[34:35], s[0:3], 0 addr64 offset:26
-; LOOP-NEXT: buffer_store_byte v10, v[34:35], s[0:3], 0 addr64 offset:27
-; LOOP-NEXT: buffer_store_byte v29, v[34:35], s[0:3], 0 addr64 offset:29
-; LOOP-NEXT: buffer_store_byte v28, v[34:35], s[0:3], 0 addr64 offset:30
-; LOOP-NEXT: buffer_store_byte v11, v[34:35], s[0:3], 0 addr64 offset:31
+; LOOP-NEXT: v_lshrrev_b32_e32 v28, 16, v12
+; LOOP-NEXT: v_bfe_u32 v29, v12, 8, 8
+; LOOP-NEXT: buffer_store_byte v12, v[6:7], s[0:3], 0 addr64 offset:24
+; LOOP-NEXT: s_waitcnt expcnt(0)
+; LOOP-NEXT: v_lshrrev_b32_e32 v12, 24, v12
+; LOOP-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; LOOP-NEXT: v_bfe_u32 v31, v13, 8, 8
+; LOOP-NEXT: buffer_store_byte v13, v[6:7], s[0:3], 0 addr64 offset:28
+; LOOP-NEXT: s_waitcnt expcnt(0)
+; LOOP-NEXT: v_lshrrev_b32_e32 v13, 24, v13
+; LOOP-NEXT: buffer_store_byte v15, v[6:7], s[0:3], 0 addr64 offset:1
+; LOOP-NEXT: buffer_store_byte v14, v[6:7], s[0:3], 0 addr64 offset:2
+; LOOP-NEXT: buffer_store_byte v16, v[6:7], s[0:3], 0 addr64 offset:3
+; LOOP-NEXT: buffer_store_byte v18, v[6:7], s[0:3], 0 addr64 offset:5
+; LOOP-NEXT: buffer_store_byte v17, v[6:7], s[0:3], 0 addr64 offset:6
+; LOOP-NEXT: buffer_store_byte v19, v[6:7], s[0:3], 0 addr64 offset:7
+; LOOP-NEXT: buffer_store_byte v21, v[6:7], s[0:3], 0 addr64 offset:9
+; LOOP-NEXT: buffer_store_byte v20, v[6:7], s[0:3], 0 addr64 offset:10
+; LOOP-NEXT: buffer_store_byte v8, v[6:7], s[0:3], 0 addr64 offset:11
+; LOOP-NEXT: buffer_store_byte v23, v[6:7], s[0:3], 0 addr64 offset:13
+; LOOP-NEXT: buffer_store_byte v22, v[6:7], s[0:3], 0 addr64 offset:14
+; LOOP-NEXT: buffer_store_byte v9, v[6:7], s[0:3], 0 addr64 offset:15
+; LOOP-NEXT: buffer_store_byte v25, v[6:7], s[0:3], 0 addr64 offset:17
+; LOOP-NEXT: buffer_store_byte v24, v[6:7], s[0:3], 0 addr64 offset:18
+; LOOP-NEXT: buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:19
+; LOOP-NEXT: buffer_store_byte v27, v[6:7], s[0:3], 0 addr64 offset:21
+; LOOP-NEXT: buffer_store_byte v26, v[6:7], s[0:3], 0 addr64 offset:22
+; LOOP-NEXT: buffer_store_byte v11, v[6:7], s[0:3], 0 addr64 offset:23
+; LOOP-NEXT: buffer_store_byte v29, v[6:7], s[0:3], 0 addr64 offset:25
+; LOOP-NEXT: buffer_store_byte v28, v[6:7], s[0:3], 0 addr64 offset:26
+; LOOP-NEXT: buffer_store_byte v12, v[6:7], s[0:3], 0 addr64 offset:27
+; LOOP-NEXT: buffer_store_byte v31, v[6:7], s[0:3], 0 addr64 offset:29
+; LOOP-NEXT: buffer_store_byte v30, v[6:7], s[0:3], 0 addr64 offset:30
+; LOOP-NEXT: buffer_store_byte v13, v[6:7], s[0:3], 0 addr64 offset:31
; LOOP-NEXT: s_cbranch_vccnz .LBB0_1
; LOOP-NEXT: ; %bb.2: ; %memcpy-split
; LOOP-NEXT: s_mov_b32 s2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 756eb2788607bf..7c6daf769aec28 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -2074,208 +2074,208 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX7-LABEL: v_mul_i256:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v16, v0
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
-; GFX7-NEXT: v_mov_b32_e32 v17, v1
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1]
-; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19]
-; GFX7-NEXT: v_addc_u32_e32 v25, vcc, 0, v24, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21]
-; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20]
-; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20]
-; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX7-NEXT: v_mov_b32_e32 v18, v23
-; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20]
-; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1]
-; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX7-NEXT: v_mov_b32_e32 v0, v20
-; GFX7-NEXT: v_mov_b32_e32 v1, v23
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1]
-; GFX7-NEXT: v_mul_lo_u32 v20, v6, v9
-; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19]
-; GFX7-NEXT: v_mul_lo_u32 v23, v5, v10
-; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11
-; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1]
-; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19]
-; GFX7-NEXT: v_mul_lo_u32 v13, v2, v13
-; GFX7-NEXT: v_mov_b32_e32 v2, v22
-; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11]
-; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2]
-; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
-; GFX7-NEXT: v_mul_lo_u32 v12, v3, v12
-; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19]
-; GFX7-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX7-NEXT: v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9]
-; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11]
-; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2]
-; GFX7-NEXT: v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9]
-; GFX7-NEXT: v_mul_lo_u32 v10, v16, v15
-; GFX7-NEXT: v_mul_lo_u32 v9, v17, v14
-; GFX7-NEXT: v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9]
-; GFX7-NEXT: v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9]
-; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9]
-; GFX7-NEXT: v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9]
-; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15]
-; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13]
-; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11]
-; GFX7-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7]
-; GFX7-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5]
-; GFX7-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
+; GFX7-NEXT: v_mul_lo_u32 v28, v4, v11
+; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
+; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
+; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
+; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22]
+; GFX7-NEXT: v_addc_u32_e32 v25, vcc, 0, v20, vcc
+; GFX7-NEXT: v_mov_b32_e32 v20, v18
+; GFX7-NEXT: v_mov_b32_e32 v18, v19
+; GFX7-NEXT: v_mov_b32_e32 v19, v16
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
+; GFX7-NEXT: v_mul_lo_u32 v16, v6, v9
+; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22]
+; GFX7-NEXT: v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5]
+; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19]
+; GFX7-NEXT: v_mov_b32_e32 v19, v22
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20]
+; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24]
+; GFX7-NEXT: v_mul_lo_u32 v24, v3, v12
+; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23]
+; GFX7-NEXT: v_mul_lo_u32 v22, v2, v13
+; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12]
+; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13]
+; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19]
+; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13]
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11]
+; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
+; GFX7-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13]
+; GFX7-NEXT: v_mov_b32_e32 v20, v11
+; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
+; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19]
+; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
+; GFX7-NEXT: v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13]
+; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14
+; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21]
+; GFX7-NEXT: v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13]
+; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15
+; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13]
+; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13]
+; GFX7-NEXT: v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5]
+; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, v10
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i256:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v16, v0
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
-; GFX8-NEXT: v_mov_b32_e32 v17, v1
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19]
-; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v24, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21]
-; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20]
-; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20]
-; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX8-NEXT: v_mov_b32_e32 v18, v23
-; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20]
-; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1]
-; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX8-NEXT: v_mov_b32_e32 v0, v20
-; GFX8-NEXT: v_mov_b32_e32 v1, v23
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1]
-; GFX8-NEXT: v_mul_lo_u32 v20, v6, v9
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19]
-; GFX8-NEXT: v_mul_lo_u32 v23, v5, v10
-; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11
-; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1]
-; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19]
-; GFX8-NEXT: v_mul_lo_u32 v13, v2, v13
-; GFX8-NEXT: v_mov_b32_e32 v2, v22
-; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11]
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2]
-; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
-; GFX8-NEXT: v_mul_lo_u32 v12, v3, v12
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19]
-; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX8-NEXT: v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9]
-; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11]
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2]
-; GFX8-NEXT: v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9]
-; GFX8-NEXT: v_mul_lo_u32 v10, v16, v15
-; GFX8-NEXT: v_mul_lo_u32 v9, v17, v14
-; GFX8-NEXT: v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9]
-; GFX8-NEXT: v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9]
-; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9]
-; GFX8-NEXT: v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9]
-; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15]
-; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13]
-; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11]
-; GFX8-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7]
-; GFX8-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5]
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
+; GFX8-NEXT: v_mul_lo_u32 v28, v4, v11
+; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
+; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
+; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
+; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22]
+; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v20, vcc
+; GFX8-NEXT: v_mov_b32_e32 v20, v18
+; GFX8-NEXT: v_mov_b32_e32 v18, v19
+; GFX8-NEXT: v_mov_b32_e32 v19, v16
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
+; GFX8-NEXT: v_mul_lo_u32 v16, v6, v9
+; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22]
+; GFX8-NEXT: v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5]
+; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19]
+; GFX8-NEXT: v_mov_b32_e32 v19, v22
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20]
+; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24]
+; GFX8-NEXT: v_mul_lo_u32 v24, v3, v12
+; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23]
+; GFX8-NEXT: v_mul_lo_u32 v22, v2, v13
+; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13]
+; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19]
+; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13]
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11]
+; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
+; GFX8-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13]
+; GFX8-NEXT: v_mov_b32_e32 v20, v11
+; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
+; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19]
+; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
+; GFX8-NEXT: v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13]
+; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14
+; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21]
+; GFX8-NEXT: v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13]
+; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15
+; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13]
+; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13]
+; GFX8-NEXT: v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5]
+; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, v10
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i256:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v16, v0
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
-; GFX9-NEXT: v_mov_b32_e32 v17, v1
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19]
-; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v24, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21]
-; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20]
-; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20]
-; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
-; GFX9-NEXT: v_mov_b32_e32 v18, v23
-; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20]
-; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1]
-; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
-; GFX9-NEXT: v_mov_b32_e32 v0, v20
-; GFX9-NEXT: v_mov_b32_e32 v1, v23
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1]
-; GFX9-NEXT: v_mul_lo_u32 v20, v6, v9
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19]
-; GFX9-NEXT: v_mul_lo_u32 v23, v5, v10
-; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11
-; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1]
-; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19]
-; GFX9-NEXT: v_mul_lo_u32 v13, v2, v13
-; GFX9-NEXT: v_mov_b32_e32 v2, v22
-; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11]
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2]
-; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9]
-; GFX9-NEXT: v_mul_lo_u32 v12, v3, v12
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19]
-; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-NEXT: v_addc_co_u32_e64 v18, s[8:9], 0, v6, s[8:9]
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11]
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2]
-; GFX9-NEXT: v_addc_co_u32_e64 v3, s[8:9], v9, v3, s[8:9]
-; GFX9-NEXT: v_mul_lo_u32 v10, v16, v15
-; GFX9-NEXT: v_mul_lo_u32 v9, v17, v14
-; GFX9-NEXT: v_addc_co_u32_e64 v4, s[8:9], v25, v4, s[8:9]
-; GFX9-NEXT: v_addc_co_u32_e64 v5, s[8:9], v18, v5, s[8:9]
-; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], v21, v6, s[8:9]
-; GFX9-NEXT: v_addc_co_u32_e64 v10, s[8:9], v24, v10, s[8:9]
-; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v10, v9, s[14:15]
-; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v9, v13, s[12:13]
-; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v9, v12, s[10:11]
-; GFX9-NEXT: v_addc_co_u32_e64 v9, s[6:7], v9, v26, s[6:7]
-; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], v9, v23, s[4:5]
-; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v20, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
+; GFX9-NEXT: v_mul_lo_u32 v28, v4, v11
+; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
+; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
+; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22]
+; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v20, vcc
+; GFX9-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-NEXT: v_mov_b32_e32 v18, v19
+; GFX9-NEXT: v_mov_b32_e32 v19, v16
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
+; GFX9-NEXT: v_mul_lo_u32 v16, v6, v9
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22]
+; GFX9-NEXT: v_addc_co_u32_e64 v26, s[4:5], 0, v6, s[4:5]
+; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19]
+; GFX9-NEXT: v_mov_b32_e32 v19, v22
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20]
+; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24]
+; GFX9-NEXT: v_mul_lo_u32 v24, v3, v12
+; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23]
+; GFX9-NEXT: v_mul_lo_u32 v22, v2, v13
+; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13]
+; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19]
+; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], 0, v4, s[12:13]
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11]
+; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
+; GFX9-NEXT: v_addc_co_u32_e64 v2, s[12:13], 0, v4, s[12:13]
+; GFX9-NEXT: v_mov_b32_e32 v20, v11
+; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19]
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
+; GFX9-NEXT: v_addc_co_u32_e64 v11, s[12:13], 0, v2, s[12:13]
+; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14
+; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21]
+; GFX9-NEXT: v_addc_co_u32_e64 v3, s[12:13], v12, v3, s[12:13]
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15
+; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], v26, v4, s[12:13]
+; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], v11, v5, s[12:13]
+; GFX9-NEXT: v_addc_co_u32_e64 v6, s[12:13], v25, v6, s[12:13]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v17, v0, s[12:13]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v0, v9, s[14:15]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v22, s[10:11]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v24, s[8:9]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v28, s[6:7]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5]
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v16, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, v10
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i256:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index e289ee759da158..4bfd29430ff1ed 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -1962,8 +1962,9 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v17
; GFX6-NEXT: v_min_i32_e32 v17, 0, v3
; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17
-; GFX6-NEXT: v_max_i32_e32 v18, 0, v3
; GFX6-NEXT: v_max_i32_e32 v17, v17, v19
+; GFX6-NEXT: buffer_load_dword v19, off, s[0:3], s32
+; GFX6-NEXT: v_max_i32_e32 v18, 0, v3
; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18
; GFX6-NEXT: v_min_i32_e32 v17, v17, v18
; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v17
@@ -1987,70 +1988,69 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
; GFX6-NEXT: v_max_i32_e32 v17, v17, v22
; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18
; GFX6-NEXT: v_min_i32_e32 v17, v17, v18
-; GFX6-NEXT: buffer_load_dword v18, off, s[0:3], s32
; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v17
; GFX6-NEXT: v_min_i32_e32 v17, 0, v7
; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17
-; GFX6-NEXT: v_max_i32_e32 v19, 0, v7
+; GFX6-NEXT: v_max_i32_e32 v18, 0, v7
; GFX6-NEXT: v_max_i32_e32 v17, v17, v23
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v16, v19
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_min_i32_e32 v19, 0, v8
+; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18
+; GFX6-NEXT: v_min_i32_e32 v17, v17, v18
; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v17
-; GFX6-NEXT: v_max_i32_e32 v17, 0, v8
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v31, v19
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT: v_max_i32_e32 v19, v19, v24
-; GFX6-NEXT: v_min_i32_e32 v17, v19, v17
-; GFX6-NEXT: v_min_i32_e32 v19, 0, v9
+; GFX6-NEXT: v_min_i32_e32 v17, 0, v8
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17
+; GFX6-NEXT: v_max_i32_e32 v18, 0, v8
+; GFX6-NEXT: v_max_i32_e32 v17, v17, v24
+; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18
+; GFX6-NEXT: v_min_i32_e32 v17, v17, v18
; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v17
-; GFX6-NEXT: v_max_i32_e32 v17, 0, v9
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v31, v19
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT: v_max_i32_e32 v19, v19, v25
-; GFX6-NEXT: v_min_i32_e32 v17, v19, v17
-; GFX6-NEXT: v_min_i32_e32 v19, 0, v10
+; GFX6-NEXT: v_min_i32_e32 v17, 0, v9
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17
+; GFX6-NEXT: v_max_i32_e32 v18, 0, v9
+; GFX6-NEXT: v_max_i32_e32 v17, v17, v25
+; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18
+; GFX6-NEXT: v_min_i32_e32 v17, v17, v18
; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17
-; GFX6-NEXT: v_max_i32_e32 v17, 0, v10
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v31, v19
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT: v_max_i32_e32 v19, v19, v26
-; GFX6-NEXT: v_min_i32_e32 v17, v19, v17
-; GFX6-NEXT: v_min_i32_e32 v19, 0, v11
+; GFX6-NEXT: v_min_i32_e32 v17, 0, v10
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17
+; GFX6-NEXT: v_max_i32_e32 v18, 0, v10
+; GFX6-NEXT: v_max_i32_e32 v17, v17, v26
+; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18
+; GFX6-NEXT: v_min_i32_e32 v17, v17, v18
; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v17
-; GFX6-NEXT: v_max_i32_e32 v17, 0, v11
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v31, v19
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT: v_max_i32_e32 v19, v19, v27
-; GFX6-NEXT: v_min_i32_e32 v17, v19, v17
-; GFX6-NEXT: v_min_i32_e32 v19, 0, v12
+; GFX6-NEXT: v_min_i32_e32 v17, 0, v11
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17
+; GFX6-NEXT: v_max_i32_e32 v18, 0, v11
+; GFX6-NEXT: v_max_i32_e32 v17, v17, v27
+; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18
+; GFX6-NEXT: v_min_i32_e32 v17, v17, v18
; GFX6-NEXT: v_add_i32_e32 v11, vcc, v11, v17
-; GFX6-NEXT: v_max_i32_e32 v17, 0, v12
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v31, v19
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT: v_max_i32_e32 v19, v19, v28
-; GFX6-NEXT: v_min_i32_e32 v17, v19, v17
-; GFX6-NEXT: v_min_i32_e32 v19, 0, v13
+; GFX6-NEXT: v_min_i32_e32 v17, 0, v12
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17
+; GFX6-NEXT: v_max_i32_e32 v18, 0, v12
+; GFX6-NEXT: v_max_i32_e32 v17, v17, v28
+; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18
+; GFX6-NEXT: v_min_i32_e32 v17, v17, v18
; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v17
-; GFX6-NEXT: v_max_i32_e32 v17, 0, v13
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v31, v19
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT: v_max_i32_e32 v19, v19, v29
-; GFX6-NEXT: v_min_i32_e32 v17, v19, v17
-; GFX6-NEXT: v_min_i32_e32 v19, 0, v14
+; GFX6-NEXT: v_min_i32_e32 v17, 0, v13
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17
+; GFX6-NEXT: v_max_i32_e32 v18, 0, v13
+; GFX6-NEXT: v_max_i32_e32 v17, v17, v29
+; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18
+; GFX6-NEXT: v_min_i32_e32 v17, v17, v18
; GFX6-NEXT: v_add_i32_e32 v13, vcc, v13, v17
-; GFX6-NEXT: v_max_i32_e32 v17, 0, v14
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v31, v19
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT: v_max_i32_e32 v19, v19, v30
-; GFX6-NEXT: v_min_i32_e32 v17, v19, v17
+; GFX6-NEXT: v_min_i32_e32 v17, 0, v14
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17
+; GFX6-NEXT: v_max_i32_e32 v18, 0, v14
+; GFX6-NEXT: v_max_i32_e32 v17, v17, v30
+; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18
+; GFX6-NEXT: v_min_i32_e32 v17, v17, v18
; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v17
; GFX6-NEXT: v_max_i32_e32 v17, 0, v15
; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v17
; GFX6-NEXT: v_min_i32_e32 v17, 0, v15
; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v18
+; GFX6-NEXT: v_max_i32_e32 v17, v17, v19
; GFX6-NEXT: v_min_i32_e32 v16, v17, v16
; GFX6-NEXT: v_add_i32_e32 v15, vcc, v15, v16
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -2083,8 +2083,9 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v17
; GFX8-NEXT: v_min_i32_e32 v17, 0, v3
; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17
-; GFX8-NEXT: v_max_i32_e32 v18, 0, v3
; GFX8-NEXT: v_max_i32_e32 v17, v17, v19
+; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32
+; GFX8-NEXT: v_max_i32_e32 v18, 0, v3
; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18
; GFX8-NEXT: v_min_i32_e32 v17, v17, v18
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v17
@@ -2108,70 +2109,69 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
; GFX8-NEXT: v_max_i32_e32 v17, v17, v22
; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18
; GFX8-NEXT: v_min_i32_e32 v17, v17, v18
-; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v17
; GFX8-NEXT: v_min_i32_e32 v17, 0, v7
; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17
-; GFX8-NEXT: v_max_i32_e32 v19, 0, v7
+; GFX8-NEXT: v_max_i32_e32 v18, 0, v7
; GFX8-NEXT: v_max_i32_e32 v17, v17, v23
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v16, v19
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_min_i32_e32 v19, 0, v8
+; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18
+; GFX8-NEXT: v_min_i32_e32 v17, v17, v18
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v17
-; GFX8-NEXT: v_max_i32_e32 v17, 0, v8
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v31, v19
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT: v_max_i32_e32 v19, v19, v24
-; GFX8-NEXT: v_min_i32_e32 v17, v19, v17
-; GFX8-NEXT: v_min_i32_e32 v19, 0, v9
+; GFX8-NEXT: v_min_i32_e32 v17, 0, v8
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17
+; GFX8-NEXT: v_max_i32_e32 v18, 0, v8
+; GFX8-NEXT: v_max_i32_e32 v17, v17, v24
+; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18
+; GFX8-NEXT: v_min_i32_e32 v17, v17, v18
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v17
-; GFX8-NEXT: v_max_i32_e32 v17, 0, v9
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v31, v19
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT: v_max_i32_e32 v19, v19, v25
-; GFX8-NEXT: v_min_i32_e32 v17, v19, v17
-; GFX8-NEXT: v_min_i32_e32 v19, 0, v10
+; GFX8-NEXT: v_min_i32_e32 v17, 0, v9
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17
+; GFX8-NEXT: v_max_i32_e32 v18, 0, v9
+; GFX8-NEXT: v_max_i32_e32 v17, v17, v25
+; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18
+; GFX8-NEXT: v_min_i32_e32 v17, v17, v18
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v17
-; GFX8-NEXT: v_max_i32_e32 v17, 0, v10
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v31, v19
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT: v_max_i32_e32 v19, v19, v26
-; GFX8-NEXT: v_min_i32_e32 v17, v19, v17
-; GFX8-NEXT: v_min_i32_e32 v19, 0, v11
+; GFX8-NEXT: v_min_i32_e32 v17, 0, v10
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17
+; GFX8-NEXT: v_max_i32_e32 v18, 0, v10
+; GFX8-NEXT: v_max_i32_e32 v17, v17, v26
+; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18
+; GFX8-NEXT: v_min_i32_e32 v17, v17, v18
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v17
-; GFX8-NEXT: v_max_i32_e32 v17, 0, v11
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v31, v19
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT: v_max_i32_e32 v19, v19, v27
-; GFX8-NEXT: v_min_i32_e32 v17, v19, v17
-; GFX8-NEXT: v_min_i32_e32 v19, 0, v12
+; GFX8-NEXT: v_min_i32_e32 v17, 0, v11
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17
+; GFX8-NEXT: v_max_i32_e32 v18, 0, v11
+; GFX8-NEXT: v_max_i32_e32 v17, v17, v27
+; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18
+; GFX8-NEXT: v_min_i32_e32 v17, v17, v18
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v17
-; GFX8-NEXT: v_max_i32_e32 v17, 0, v12
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v31, v19
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT: v_max_i32_e32 v19, v19, v28
-; GFX8-NEXT: v_min_i32_e32 v17, v19, v17
-; GFX8-NEXT: v_min_i32_e32 v19, 0, v13
+; GFX8-NEXT: v_min_i32_e32 v17, 0, v12
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17
+; GFX8-NEXT: v_max_i32_e32 v18, 0, v12
+; GFX8-NEXT: v_max_i32_e32 v17, v17, v28
+; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18
+; GFX8-NEXT: v_min_i32_e32 v17, v17, v18
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v17
-; GFX8-NEXT: v_max_i32_e32 v17, 0, v13
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v31, v19
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT: v_max_i32_e32 v19, v19, v29
-; GFX8-NEXT: v_min_i32_e32 v17, v19, v17
-; GFX8-NEXT: v_min_i32_e32 v19, 0, v14
+; GFX8-NEXT: v_min_i32_e32 v17, 0, v13
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17
+; GFX8-NEXT: v_max_i32_e32 v18, 0, v13
+; GFX8-NEXT: v_max_i32_e32 v17, v17, v29
+; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18
+; GFX8-NEXT: v_min_i32_e32 v17, v17, v18
; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v17
-; GFX8-NEXT: v_max_i32_e32 v17, 0, v14
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v31, v19
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT: v_max_i32_e32 v19, v19, v30
-; GFX8-NEXT: v_min_i32_e32 v17, v19, v17
+; GFX8-NEXT: v_min_i32_e32 v17, 0, v14
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17
+; GFX8-NEXT: v_max_i32_e32 v18, 0, v14
+; GFX8-NEXT: v_max_i32_e32 v17, v17, v30
+; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18
+; GFX8-NEXT: v_min_i32_e32 v17, v17, v18
; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v17
; GFX8-NEXT: v_max_i32_e32 v17, 0, v15
; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v17
; GFX8-NEXT: v_min_i32_e32 v17, 0, v15
; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v18
+; GFX8-NEXT: v_max_i32_e32 v17, v17, v19
; GFX8-NEXT: v_min_i32_e32 v16, v17, v16
; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v16
; GFX8-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 14b30e0d79946c..c77438f98b84e6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -429,190 +429,193 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[5:6]
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13]
-; GISEL-NEXT: v_xor_b32_e32 v5, v0, v9
+; GISEL-NEXT: v_xor_b32_e32 v15, v0, v9
; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11
-; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12
-; GISEL-NEXT: v_xor_b32_e32 v15, v1, v9
+; GISEL-NEXT: v_mul_lo_u32 v5, v14, v12
+; GISEL-NEXT: v_xor_b32_e32 v16, v1, v9
; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11
; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0
-; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; GISEL-NEXT: v_mul_hi_u32 v5, v14, v12
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5
+; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v15, v0
-; GISEL-NEXT: v_mul_lo_u32 v12, v5, v1
-; GISEL-NEXT: v_mul_hi_u32 v13, v5, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_mul_lo_u32 v5, v16, v0
+; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1
+; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0
+; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v15, v1
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_mul_hi_u32 v12, v5, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v12, v16, v1
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5
+; GISEL-NEXT: v_mul_hi_u32 v11, v15, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v11
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v13, 0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v11
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v14, v[1:2]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v13, v[11:12]
-; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v15, v11, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v11
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v4
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v11, v12, s[4:5]
-; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v12, v11
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v0, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v5
+; GISEL-NEXT: v_mov_b32_e32 v1, v12
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v14, v[1:2]
; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5
; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v0, v[12:13]
; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5
; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5
; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7
-; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v6
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v14, vcc
-; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v15
+; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6
+; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11
+; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12
+; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13
; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v10
-; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1
-; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
-; GISEL-NEXT: v_trunc_f32_e32 v18, v1
-; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v18
-; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v0
+; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc
+; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v4, vcc
+; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1
+; GISEL-NEXT: v_trunc_f32_e32 v16, v11
+; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16
+; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1
+; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v7
+; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v6, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v10
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v10
+; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10
+; GISEL-NEXT: v_mov_b32_e32 v1, v12
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v4
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13]
+; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v4
+; GISEL-NEXT: v_mul_lo_u32 v10, v18, v12
+; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10
+; GISEL-NEXT: v_mul_hi_u32 v10, v18, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7]
+; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11
+; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9]
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
-; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v19, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v18
-; GISEL-NEXT: v_subb_u32_e32 v21, vcc, 0, v6, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v15, v10, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v20, v18, v[1:2]
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v21, v19, v[10:11]
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v17, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v16, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v18, v0
-; GISEL-NEXT: v_mul_lo_u32 v11, v19, v10
-; GISEL-NEXT: v_mul_hi_u32 v16, v19, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc
-; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v16
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v16, v18, v10
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT: v_mul_hi_u32 v11, v19, v10
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v4
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9]
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v4, v15, v21, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v10
+; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v17, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7]
+; GISEL-NEXT: v_cndmask_b32_e32 v15, v15, v21, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10
+; GISEL-NEXT: v_mul_lo_u32 v13, v16, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT: v_mul_hi_u32 v13, v18, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11
-; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v18, v10
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v16, v11, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v12, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
+; GISEL-NEXT: v_mov_b32_e32 v0, v11
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v13, v[0:1]
+; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[0:1]
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v11
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v11, vcc
+; GISEL-NEXT: v_xor_b32_e32 v15, v1, v11
+; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10
+; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0
+; GISEL-NEXT: v_xor_b32_e32 v16, v2, v11
+; GISEL-NEXT: v_mul_hi_u32 v2, v12, v10
+; GISEL-NEXT: v_xor_b32_e32 v9, v4, v8
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v19, v0
-; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v18, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v10, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc
-; GISEL-NEXT: v_xor_b32_e32 v13, v9, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v20, v11, v[1:2]
-; GISEL-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc
-; GISEL-NEXT: v_ashrrev_i32_e32 v14, 31, v3
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v21, v10, v[8:9]
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14
-; GISEL-NEXT: v_xor_b32_e32 v1, v4, v13
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v14, vcc
-; GISEL-NEXT: v_xor_b32_e32 v9, v2, v14
-; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0
-; GISEL-NEXT: v_mul_lo_u32 v4, v10, v8
-; GISEL-NEXT: v_xor_b32_e32 v15, v3, v14
-; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0
+; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0
+; GISEL-NEXT: v_mul_hi_u32 v4, v13, v10
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v11, v8
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2
-; GISEL-NEXT: v_mul_hi_u32 v4, v10, v8
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v11, v8
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1
+; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v13, v0, vcc
+; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1
; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0
-; GISEL-NEXT: v_mul_lo_u32 v4, v9, v2
-; GISEL-NEXT: v_mul_hi_u32 v10, v9, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v11, v15, v2
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10
+; GISEL-NEXT: v_mul_hi_u32 v4, v15, v1
+; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1
+; GISEL-NEXT: v_xor_b32_e32 v10, v14, v8
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v10, v15, v2
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT: v_mul_hi_u32 v4, v9, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v10, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v2
+; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1
; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v11, v[0:1]
-; GISEL-NEXT: v_xor_b32_e32 v8, v12, v13
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v13
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v10, v[3:4]
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v13, vcc
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[0:1]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v9, v8
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v12, v[3:4]
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v8, vcc
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v15, v2
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6
; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
@@ -622,23 +625,23 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6
; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v4, v8, v9, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v10
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v12
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8
; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
-; GISEL-NEXT: v_xor_b32_e32 v4, v14, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GISEL-NEXT: v_xor_b32_e32 v4, v11, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4
; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
@@ -1189,123 +1192,123 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_trunc_f32_e32 v8, v5
; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8
; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v8
+; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
-; GISEL-NEXT: v_mov_b32_e32 v8, v5
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9]
-; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11]
-; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4
+; GISEL-NEXT: v_mov_b32_e32 v9, v5
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4
-; GISEL-NEXT: v_mul_lo_u32 v8, v7, v13
-; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[9:10]
+; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4
+; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9
+; GISEL-NEXT: v_mul_lo_u32 v4, v8, v9
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v13
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8
-; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT: v_mul_hi_u32 v14, v7, v9
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8
+; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13
; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4
; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v8, vcc
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
; GISEL-NEXT: v_mov_b32_e32 v4, v14
; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13
; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc
-; GISEL-NEXT: v_xor_b32_e32 v15, v0, v8
-; GISEL-NEXT: v_mul_lo_u32 v0, v17, v13
-; GISEL-NEXT: v_mul_lo_u32 v4, v16, v14
-; GISEL-NEXT: v_xor_b32_e32 v18, v1, v8
-; GISEL-NEXT: v_mul_hi_u32 v1, v16, v13
-; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: s_mov_b32 s6, 1
+; GISEL-NEXT: s_cmp_lg_u32 s6, 0
+; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14
+; GISEL-NEXT: s_subb_u32 s6, 0, 0
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v16, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v17, v14
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13
+; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13
+; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4
+; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; GISEL-NEXT: v_mul_hi_u32 v13, v17, v14
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9
+; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
+; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14
+; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v18, v0
-; GISEL-NEXT: v_mul_lo_u32 v14, v15, v1
-; GISEL-NEXT: v_mul_hi_u32 v16, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
+; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0
+; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1
+; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0
; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v16, v18, v1
+; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v0, v13
-; GISEL-NEXT: v_mul_hi_u32 v17, v18, v1
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13
+; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v13
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v17, v[1:2]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v15, v0
-; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v18, v13, vcc
-; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v18, v13
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v13
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v0
+; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc
+; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v16
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v17, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v15
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
-; GISEL-NEXT: s_mov_b32 s6, 1
-; GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GISEL-NEXT: s_subb_u32 s6, 0, 0
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1]
; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13
-; GISEL-NEXT: v_mul_lo_u32 v19, v7, v0
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v15, vcc
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v17, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
+; GISEL-NEXT: v_mul_lo_u32 v18, v7, v0
; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v19
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0
+; GISEL-NEXT: v_mul_lo_u32 v11, v8, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
; GISEL-NEXT: v_mul_hi_u32 v10, v7, v0
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
@@ -1313,34 +1316,34 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc
+; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; GISEL-NEXT: v_cndmask_b32_e32 v11, v17, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
-; GISEL-NEXT: v_xor_b32_e32 v1, v11, v8
+; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
+; GISEL-NEXT: v_xor_b32_e32 v1, v11, v9
; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3
; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6]
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v16, v13, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v13, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11
; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc
; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11
-; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0
+; GISEL-NEXT: v_mul_lo_u32 v2, v8, v0
; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5
; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11
; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5
+; GISEL-NEXT: v_mul_lo_u32 v3, v8, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2
; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
@@ -1348,19 +1351,19 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3
; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2
-; GISEL-NEXT: v_xor_b32_e32 v10, v10, v8
+; GISEL-NEXT: v_xor_b32_e32 v10, v10, v9
; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v8
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v9
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
@@ -1421,178 +1424,178 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; CGP-NEXT: v_trunc_f32_e32 v7, v5
-; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7
-; CGP-NEXT: v_cvt_u32_f32_e32 v8, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v9, v7
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0
-; CGP-NEXT: v_mov_b32_e32 v7, v5
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8]
-; CGP-NEXT: v_mul_hi_u32 v12, v9, v4
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v8, v[10:11]
-; CGP-NEXT: v_mul_lo_u32 v10, v9, v4
-; CGP-NEXT: v_mul_hi_u32 v11, v8, v4
-; CGP-NEXT: v_mul_lo_u32 v4, v8, v13
-; CGP-NEXT: v_mul_lo_u32 v7, v9, v13
-; CGP-NEXT: v_mul_hi_u32 v14, v8, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v9, v13
+; CGP-NEXT: v_trunc_f32_e32 v8, v5
+; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8
+; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
+; CGP-NEXT: v_mov_b32_e32 v9, v5
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
+; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
+; CGP-NEXT: v_mul_hi_u32 v12, v8, v4
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10]
+; CGP-NEXT: v_mul_lo_u32 v10, v8, v4
+; CGP-NEXT: v_mul_lo_u32 v4, v7, v9
+; CGP-NEXT: v_mul_lo_u32 v13, v8, v9
+; CGP-NEXT: v_mul_hi_u32 v14, v7, v9
+; CGP-NEXT: v_mul_hi_u32 v9, v8, v9
; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v8, v4
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4
; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; CGP-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc
+; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
; CGP-NEXT: v_mov_b32_e32 v4, v14
; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v7
+; CGP-NEXT: v_mul_lo_u32 v4, v17, v13
; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15]
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; CGP-NEXT: v_xor_b32_e32 v15, v0, v7
-; CGP-NEXT: v_mul_lo_u32 v0, v17, v13
-; CGP-NEXT: v_mul_lo_u32 v4, v16, v14
-; CGP-NEXT: v_xor_b32_e32 v18, v1, v7
-; CGP-NEXT: v_mul_hi_u32 v1, v16, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v17, v13
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT: v_mul_lo_u32 v9, v16, v14
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; CGP-NEXT: v_mul_hi_u32 v9, v16, v13
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v1, v17, v14
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; CGP-NEXT: v_mul_hi_u32 v4, v16, v14
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13
+; CGP-NEXT: v_mul_hi_u32 v9, v17, v13
+; CGP-NEXT: v_mul_lo_u32 v13, v17, v14
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
+; CGP-NEXT: v_mul_hi_u32 v15, v16, v14
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_mul_hi_u32 v13, v17, v14
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9
+; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
+; CGP-NEXT: v_xor_b32_e32 v18, v0, v9
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v17, v14
+; CGP-NEXT: v_xor_b32_e32 v19, v1, v9
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v18, v0
-; CGP-NEXT: v_mul_lo_u32 v14, v15, v1
-; CGP-NEXT: v_mul_hi_u32 v16, v15, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v18, v0
+; CGP-NEXT: v_mul_lo_u32 v13, v19, v0
+; CGP-NEXT: v_mul_lo_u32 v14, v18, v1
+; CGP-NEXT: v_mul_hi_u32 v15, v18, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v19, v0
; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v16, v18, v1
+; CGP-NEXT: v_mul_lo_u32 v15, v19, v1
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v14, v15, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v14, v18, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v0, v13
-; CGP-NEXT: v_mul_hi_u32 v17, v18, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v0, v13
+; CGP-NEXT: v_mul_hi_u32 v16, v19, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v13
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v17, v[1:2]
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v15, v0
-; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v18, v13, vcc
-; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v18, v13
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v13
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2]
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v18, v0
+; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc
+; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, 1, v16
-; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v17, vcc
+; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v15
+; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
; CGP-NEXT: v_mov_b32_e32 v0, v5
; CGP-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5]
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
; CGP-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v8, v[0:1]
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1]
; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v19, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v15
-; CGP-NEXT: v_mul_lo_u32 v19, v8, v0
+; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v17
; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v18, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v1, vcc
+; CGP-NEXT: v_mul_lo_u32 v5, v7, v0
+; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc
; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v19
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v11, v9, v0
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; CGP-NEXT: v_mul_hi_u32 v10, v8, v0
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT: v_mul_lo_u32 v10, v8, v0
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1
+; CGP-NEXT: v_mul_hi_u32 v5, v7, v0
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT: v_mul_hi_u32 v0, v9, v0
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v1
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0
+; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v5, v7
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
-; CGP-NEXT: v_cndmask_b32_e32 v10, v17, v13, vcc
-; CGP-NEXT: v_xor_b32_e32 v1, v10, v7
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v8, v[5:6]
+; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v17, vcc
+; CGP-NEXT: v_xor_b32_e32 v11, v5, v9
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
+; CGP-NEXT: v_cndmask_b32_e32 v10, v16, v13, vcc
+; CGP-NEXT: v_xor_b32_e32 v1, v10, v9
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6]
; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10
; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
; CGP-NEXT: v_xor_b32_e32 v12, v2, v10
-; CGP-NEXT: v_mul_lo_u32 v2, v9, v0
-; CGP-NEXT: v_mul_lo_u32 v6, v8, v5
+; CGP-NEXT: v_mul_lo_u32 v2, v8, v0
+; CGP-NEXT: v_mul_lo_u32 v6, v7, v5
; CGP-NEXT: v_xor_b32_e32 v13, v3, v10
-; CGP-NEXT: v_mul_hi_u32 v3, v8, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v9, v0
+; CGP-NEXT: v_mul_hi_u32 v3, v7, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v3, v9, v5
+; CGP-NEXT: v_mul_lo_u32 v3, v8, v5
; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v6, v8, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v7, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v9, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v8, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v0
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0
+; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
; CGP-NEXT: v_mul_lo_u32 v6, v12, v2
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v7
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc
; CGP-NEXT: v_mul_hi_u32 v7, v12, v3
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
@@ -1851,6 +1854,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v5, vcc
; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v11
; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7
+; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v10
; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7
; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7
; GISEL-NEXT: v_trunc_f32_e32 v13, v11
@@ -1861,22 +1865,22 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_mov_b32_e32 v7, v12
; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8]
; GISEL-NEXT: v_mul_lo_u32 v7, v17, v11
-; GISEL-NEXT: v_mul_hi_u32 v18, v14, v11
; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13]
-; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11
; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v18, v17, v12
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v18, v11
+; GISEL-NEXT: v_mul_hi_u32 v13, v14, v11
; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v13, v17, v12
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v18, v7
+; GISEL-NEXT: v_mul_hi_u32 v18, v14, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18
; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
@@ -1891,24 +1895,24 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13]
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; GISEL-NEXT: v_xor_b32_e32 v13, v0, v7
+; GISEL-NEXT: v_xor_b32_e32 v15, v0, v7
; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11
-; GISEL-NEXT: v_mul_lo_u32 v15, v14, v12
+; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12
; GISEL-NEXT: v_xor_b32_e32 v16, v1, v7
; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11
; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v15, v14, v12
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0
+; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -1917,164 +1921,166 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
; GISEL-NEXT: v_mul_lo_u32 v11, v16, v0
-; GISEL-NEXT: v_mul_lo_u32 v12, v13, v1
-; GISEL-NEXT: v_mul_hi_u32 v14, v13, v0
+; GISEL-NEXT: v_mul_lo_u32 v12, v15, v1
+; GISEL-NEXT: v_mul_hi_u32 v13, v15, v0
; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0
-; GISEL-NEXT: v_mul_hi_u32 v15, v16, v1
+; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v14, v16, v1
+; GISEL-NEXT: v_mul_lo_u32 v13, v16, v1
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_mul_hi_u32 v12, v13, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v12, v15, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v11
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v14, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v11
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v15, v[1:2]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v13, v0
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v14, v[11:12]
-; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4
-; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v16, v11, vcc
-; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v16, v11
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v6, v5, vcc
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v13, v11, v12, s[4:5]
-; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v1, vcc
-; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v10
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v12
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
+; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v0, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v1, v12
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v14, v[1:2]
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v6
; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v6, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v5, v0, v[12:13]
; GISEL-NEXT: v_xor_b32_e32 v10, v1, v6
; GISEL-NEXT: v_xor_b32_e32 v9, v9, v6
; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10
-; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v9
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v14
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v15, vcc
-; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12
+; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v9
+; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11
+; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12
+; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13
; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v8
-; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1
-; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
-; GISEL-NEXT: v_trunc_f32_e32 v18, v1
-; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v18
-; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v0
+; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc
+; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v5, vcc
+; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1
+; GISEL-NEXT: v_trunc_f32_e32 v16, v11
+; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16
+; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1
+; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v10
+; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v9, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v8
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v8
+; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8
+; GISEL-NEXT: v_mov_b32_e32 v1, v12
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v5
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13]
+; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v5
+; GISEL-NEXT: v_mul_lo_u32 v8, v18, v12
+; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, v18, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7]
+; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11
+; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9]
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
-; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 0, v10
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v19, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v18
-; GISEL-NEXT: v_subb_u32_e32 v21, vcc, 0, v9, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v8, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v20, v18, v[1:2]
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v21, v19, v[11:12]
-; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v17, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v5
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9]
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v15, v21, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v8
+; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v17, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v21, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7]
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; GISEL-NEXT: v_mul_lo_u32 v15, v16, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11
+; GISEL-NEXT: v_mul_hi_u32 v15, v18, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v11
+; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v16, v12, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v19, v13, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v18, v0
-; GISEL-NEXT: v_mul_lo_u32 v12, v19, v11
-; GISEL-NEXT: v_mul_hi_u32 v16, v19, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc
-; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v16
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v16, v18, v11
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1
-; GISEL-NEXT: v_mul_hi_u32 v12, v19, v11
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12
-; GISEL-NEXT: v_mul_hi_u32 v11, v18, v11
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, v[0:1]
+; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v14, v8, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[0:1]
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v12
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v12, vcc
+; GISEL-NEXT: v_xor_b32_e32 v14, v1, v12
+; GISEL-NEXT: v_mul_lo_u32 v1, v15, v11
+; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0
+; GISEL-NEXT: v_xor_b32_e32 v16, v2, v12
+; GISEL-NEXT: v_mul_hi_u32 v2, v13, v11
+; GISEL-NEXT: v_mul_hi_u32 v4, v15, v11
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v19, v0
-; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v18, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v11, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; GISEL-NEXT: v_cndmask_b32_e32 v13, v14, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v12, v[1:2]
-; GISEL-NEXT: v_xor_b32_e32 v1, v13, v7
-; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v21, v11, v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v8, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc
-; GISEL-NEXT: v_xor_b32_e32 v5, v2, v13
-; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0
-; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4
-; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13
-; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v12, v4
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v14, v2
-; GISEL-NEXT: v_mul_hi_u32 v14, v11, v4
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v14
-; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v15, v0, vcc
+; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1
+; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v4, v14, v1
+; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1
+; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0
-; GISEL-NEXT: v_mul_lo_u32 v4, v5, v2
-; GISEL-NEXT: v_mul_hi_u32 v11, v5, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v12, v15, v2
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v11
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v15, v2
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT: v_mul_hi_u32 v4, v5, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v2
+; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0
; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v11, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1
; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v12, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v13, v[0:1]
; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v7
; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4]
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v9
; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
@@ -2085,7 +2091,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[4:5]
; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10
@@ -2099,8 +2105,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
-; GISEL-NEXT: v_xor_b32_e32 v4, v13, v6
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GISEL-NEXT: v_xor_b32_e32 v4, v12, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4
; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
@@ -2661,16 +2667,16 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GISEL-NEXT: v_mul_lo_u32 v6, v13, v4
-; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v3
+; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5
; GISEL-NEXT: v_cndmask_b32_e32 v9, v18, v1, vcc
; GISEL-NEXT: v_mul_hi_u32 v1, v11, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_mul_lo_u32 v6, v13, v5
-; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1
; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 5f568839a28dd3..40f29c56c8f127 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -1537,36 +1537,36 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[0:1]
; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s2, v2
; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc
-; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v8
-; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v9, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12
-; GFX8-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11
-; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12
-; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s2, v11
-; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[0:1]
-; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v13
-; GFX8-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v14, s[0:1]
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
-; GFX8-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1]
+; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v8
+; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc
+; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v9, s[0:1]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v14
+; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc
+; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
+; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v11
+; GFX8-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc
; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v9, v14, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v2, v6, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1]
; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11]
-; GFX8-NEXT: v_xor_b32_e32 v2, s0, v8
-; GFX8-NEXT: v_xor_b32_e32 v3, s1, v9
-; GFX8-NEXT: v_mov_b32_e32 v8, s1
+; GFX8-NEXT: v_xor_b32_e32 v2, s0, v6
+; GFX8-NEXT: v_xor_b32_e32 v3, s1, v8
+; GFX8-NEXT: v_mov_b32_e32 v6, s1
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2
-; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc
-; GFX8-NEXT: v_xor_b32_e32 v6, s6, v6
+; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; GFX8-NEXT: v_xor_b32_e32 v6, s6, v9
; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7
; GFX8-NEXT: v_mov_b32_e32 v8, s6
; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v6
@@ -1635,7 +1635,6 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, s9
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
@@ -1680,206 +1679,206 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v5, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_add3_u32 v4, v3, v0, v6
-; GFX9-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[0:1]
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
; GFX9-NEXT: v_mov_b32_e32 v6, s11
-; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s10, v1
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s9, v5, v[2:3]
+; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s10, v0
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v4, s9
; GFX9-NEXT: s_ashr_i32 s10, s3, 31
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v2, vcc
-; GFX9-NEXT: v_sub_u32_e32 v1, s11, v2
+; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc
+; GFX9-NEXT: v_sub_u32_e32 v0, s11, v1
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6
-; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s8, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v3, s[0:1]
-; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v1, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], 1, v5
-; GFX9-NEXT: v_addc_co_u32_e64 v12, s[0:1], 0, v4, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v11
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v11
-; GFX9-NEXT: v_cndmask_b32_e64 v13, v2, v13, s[0:1]
-; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v3
-; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v12, s[0:1]
+; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s8, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v2, s[0:1]
+; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc
+; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5
+; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9
+; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10
+; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2
+; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
; GFX9-NEXT: s_add_u32 s0, s18, s6
; GFX9-NEXT: s_addc_u32 s1, s19, s6
; GFX9-NEXT: s_add_u32 s2, s2, s10
; GFX9-NEXT: s_mov_b32 s11, s10
; GFX9-NEXT: s_addc_u32 s3, s3, s10
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11]
-; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s3
-; GFX9-NEXT: v_cvt_f32_u32_e32 v16, s2
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
-; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2
-; GFX9-NEXT: v_add_f32_e32 v2, v2, v16
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s8, v10
-; GFX9-NEXT: v_subbrev_co_u32_e32 v16, vcc, 0, v1, vcc
-; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v2
-; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1
-; GFX9-NEXT: v_trunc_f32_e32 v17, v2
-; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v17
-; GFX9-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v1
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
+; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s2
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
+; GFX9-NEXT: v_add_f32_e32 v1, v1, v15
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
+; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v9
+; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; GFX9-NEXT: v_trunc_f32_e32 v16, v1
+; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v16
+; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v0
; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7]
; GFX9-NEXT: s_sub_u32 s5, 0, s2
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v18, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v3, v14, vcc
-; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v17
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v2, v13, vcc
+; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v16
; GFX9-NEXT: s_subb_u32 s20, 0, s3
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v14, v[2:3]
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v4, v12, s[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s20, v18, v[2:3]
-; GFX9-NEXT: v_mul_lo_u32 v3, v14, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v16, vcc
-; GFX9-NEXT: v_mul_lo_u32 v4, v18, v2
-; GFX9-NEXT: v_mul_hi_u32 v11, v18, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v14, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[0:1]
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v13, v[1:2]
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, v11, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v17, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v2, v13, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v15, vcc
+; GFX9-NEXT: v_mul_lo_u32 v3, v17, v1
+; GFX9-NEXT: v_mul_hi_u32 v10, v17, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v13, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v11, v14, v2
-; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_mul_hi_u32 v4, v18, v2
-; GFX9-NEXT: v_mul_hi_u32 v2, v14, v2
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v11, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
-; GFX9-NEXT: v_add_u32_e32 v4, v11, v4
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v10
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v10, v13, v1
+; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT: v_mul_hi_u32 v3, v17, v1
+; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v18, v1
-; GFX9-NEXT: v_add3_u32 v2, v4, v3, v2
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[18:19], s5, v11, 0
-; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v14, v2, vcc
-; GFX9-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v12, v[1:2]
-; GFX9-NEXT: v_xor_b32_e32 v8, s16, v5
-; GFX9-NEXT: v_xor_b32_e32 v9, s17, v9
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s20, v11, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v10, s17
-; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s16, v8
-; GFX9-NEXT: v_xor_b32_e32 v5, s4, v7
-; GFX9-NEXT: v_mul_lo_u32 v7, v12, v3
-; GFX9-NEXT: v_mul_lo_u32 v8, v11, v4
-; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v9, v10, vcc
-; GFX9-NEXT: v_mul_hi_u32 v9, v11, v3
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v9, v12, v4
-; GFX9-NEXT: v_mul_hi_u32 v3, v12, v3
-; GFX9-NEXT: v_add_u32_e32 v7, v8, v7
-; GFX9-NEXT: v_mul_hi_u32 v8, v11, v4
-; GFX9-NEXT: v_mul_hi_u32 v4, v12, v4
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v9, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7
-; GFX9-NEXT: v_add_u32_e32 v8, v9, v8
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_add_u32_e32 v3, v10, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v17, v0
+; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0
+; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v4, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v11, v[0:1]
+; GFX9-NEXT: v_xor_b32_e32 v5, s16, v5
+; GFX9-NEXT: v_xor_b32_e32 v8, s17, v8
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v10, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v9, s17
+; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v5
+; GFX9-NEXT: v_xor_b32_e32 v4, s4, v7
+; GFX9-NEXT: v_mul_lo_u32 v5, v11, v2
+; GFX9-NEXT: v_mul_lo_u32 v7, v10, v3
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v9, vcc
+; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7
; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v4, v8, v7, v4
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v11, v3
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v12, v4, vcc
-; GFX9-NEXT: v_mul_lo_u32 v7, s9, v3
-; GFX9-NEXT: v_mul_lo_u32 v8, s8, v4
-; GFX9-NEXT: v_mul_hi_u32 v10, s8, v3
-; GFX9-NEXT: v_mul_hi_u32 v3, s9, v3
-; GFX9-NEXT: v_mul_hi_u32 v12, s9, v4
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3
+; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2
+; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
+; GFX9-NEXT: v_mul_hi_u32 v7, v10, v3
+; GFX9-NEXT: v_mul_hi_u32 v3, v11, v3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v10, s9, v4
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
; GFX9-NEXT: v_add_u32_e32 v7, v8, v7
-; GFX9-NEXT: v_mul_hi_u32 v8, s8, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v3, v7, v5, v3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc
+; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2
+; GFX9-NEXT: v_mul_lo_u32 v7, s8, v3
+; GFX9-NEXT: v_mul_hi_u32 v9, s8, v2
+; GFX9-NEXT: v_mul_hi_u32 v2, s9, v2
+; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v9, s9, v3
+; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
+; GFX9-NEXT: v_mul_hi_u32 v7, s8, v3
; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v3, v7
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v11, 0
-; GFX9-NEXT: v_mov_b32_e32 v9, s4
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s4, v5
-; GFX9-NEXT: v_add_u32_e32 v8, v10, v8
-; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v9, vcc
-; GFX9-NEXT: v_add3_u32 v9, v8, v7, v12
-; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s2, v9, v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v10, s9
-; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s8, v3
-; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s3, v11, v[7:8]
-; GFX9-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v10, v7, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v8
-; GFX9-NEXT: v_sub_u32_e32 v7, s9, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v8
-; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v12, vcc, s2, v3
-; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v7, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v11
-; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v9, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v13
-; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v12
-; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v7, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v13
-; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s2, v12
-; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1]
-; GFX9-NEXT: v_add_co_u32_e64 v17, s[0:1], 1, v14
-; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX9-NEXT: v_addc_co_u32_e64 v18, s[0:1], 0, v15, s[0:1]
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v18, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v10, v11, v14, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v3, v7, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v4, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0
+; GFX9-NEXT: v_mov_b32_e32 v8, s4
+; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v8, vcc
+; GFX9-NEXT: v_add_u32_e32 v6, v9, v7
+; GFX9-NEXT: v_add3_u32 v8, v6, v11, v12
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v8, v[3:4]
+; GFX9-NEXT: v_mov_b32_e32 v9, s9
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v2
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v10, v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v9, v6, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7
+; GFX9-NEXT: v_sub_u32_e32 v6, s9, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7
+; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[0:1]
+; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s2, v2
+; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12
+; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11
+; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12
+; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v10
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v6, v3, vcc
+; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 1, v14
+; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc
+; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v11
+; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1]
; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11]
-; GFX9-NEXT: v_xor_b32_e32 v3, s0, v10
-; GFX9-NEXT: v_xor_b32_e32 v4, s1, v9
-; GFX9-NEXT: v_mov_b32_e32 v9, s1
-; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v3
-; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v9, vcc
+; GFX9-NEXT: v_xor_b32_e32 v2, s0, v6
+; GFX9-NEXT: v_xor_b32_e32 v3, s1, v8
+; GFX9-NEXT: v_mov_b32_e32 v6, s1
+; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc
+; GFX9-NEXT: v_xor_b32_e32 v6, s6, v9
+; GFX9-NEXT: v_mov_b32_e32 v13, 0
; GFX9-NEXT: v_xor_b32_e32 v7, s6, v7
-; GFX9-NEXT: v_xor_b32_e32 v8, s6, v8
-; GFX9-NEXT: v_mov_b32_e32 v9, s6
-; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v7
-; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v8, v9, vcc
-; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[12:13]
-; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[14:15]
+; GFX9-NEXT: v_mov_b32_e32 v8, s6
+; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v6
+; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v8, vcc
+; GFX9-NEXT: global_store_dwordx4 v13, v[0:3], s[12:13]
+; GFX9-NEXT: global_store_dwordx4 v13, v[4:7], s[14:15]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sdivrem_v2i64:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index ee7a040e41fd5e..bb8f3cd6990f88 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -419,24 +419,24 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11]
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GISEL-NEXT: v_xor_b32_e32 v11, v0, v4
+; GISEL-NEXT: v_xor_b32_e32 v13, v0, v4
; GISEL-NEXT: v_mul_lo_u32 v0, v15, v9
-; GISEL-NEXT: v_mul_lo_u32 v13, v12, v10
+; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10
; GISEL-NEXT: v_xor_b32_e32 v14, v1, v4
; GISEL-NEXT: v_mul_hi_u32 v1, v12, v9
; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_mul_lo_u32 v1, v15, v10
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0
-; GISEL-NEXT: v_mul_hi_u32 v13, v12, v10
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -445,190 +445,191 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc
; GISEL-NEXT: v_mul_lo_u32 v9, v14, v0
-; GISEL-NEXT: v_mul_lo_u32 v10, v11, v1
-; GISEL-NEXT: v_mul_hi_u32 v12, v11, v0
+; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1
+; GISEL-NEXT: v_mul_hi_u32 v11, v13, v0
; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
-; GISEL-NEXT: v_mul_hi_u32 v13, v14, v1
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v12, v14, v1
+; GISEL-NEXT: v_mul_lo_u32 v11, v14, v1
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT: v_mul_hi_u32 v10, v11, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v9
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v12, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_mul_hi_u32 v1, v14, v1
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v1, v0
+; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v7
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v10, vcc
+; GISEL-NEXT: v_xor_b32_e32 v6, v6, v10
+; GISEL-NEXT: v_xor_b32_e32 v7, v7, v10
+; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6
+; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v7
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6
+; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v15
; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v9, v[1:2]
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v12, v[9:10]
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v11, v0
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12
+; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v11, v[9:10]
+; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1
+; GISEL-NEXT: v_trunc_f32_e32 v12, v10
+; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v12
+; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1
+; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12
+; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v13, v0
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0
+; GISEL-NEXT: v_mov_b32_e32 v0, v11
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, v[0:1]
; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v14, v9, vcc
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v14, v9
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v9, s[4:5]
-; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v0, v8, vcc
-; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v7
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v0
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v0, vcc
-; GISEL-NEXT: v_xor_b32_e32 v6, v1, v0
-; GISEL-NEXT: v_xor_b32_e32 v7, v7, v0
-; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v6
-; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7
-; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v10, v5
-; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v9, vcc
-; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v8
-; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v15, v15, v1, s[4:5]
-; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
-; GISEL-NEXT: v_trunc_f32_e32 v16, v1
-; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v16
-; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0
-; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 0, v6
-; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, v7, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v17, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16
-; GISEL-NEXT: v_subb_u32_e32 v20, vcc, v9, v8, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v18, v16, v[1:2]
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v5
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v17, v[8:9]
-; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v20, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
-; GISEL-NEXT: v_cndmask_b32_e32 v9, v13, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v16, v0
-; GISEL-NEXT: v_mul_lo_u32 v13, v17, v8
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v15, v[0:1]
+; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v14, v9
+; GISEL-NEXT: v_mul_lo_u32 v9, v12, v10
+; GISEL-NEXT: v_mul_lo_u32 v14, v15, v0
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v8
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14
+; GISEL-NEXT: v_mul_hi_u32 v14, v15, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v10, v12, v10
+; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v13, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v11, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[6:7]
+; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v13, v5
+; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v1, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v8
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9]
+; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v8
+; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v14, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7]
+; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc
-; GISEL-NEXT: v_mul_hi_u32 v14, v17, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v14, v16, v8
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT: v_mul_hi_u32 v13, v17, v8
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v19, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v14, v12, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v18, v1
+; GISEL-NEXT: v_mul_hi_u32 v18, v15, v0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_mul_hi_u32 v8, v16, v8
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v0
-; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v16, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v13, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v18, v14, v[1:2]
-; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4
-; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v13, v[8:9]
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; GISEL-NEXT: v_xor_b32_e32 v11, v2, v10
-; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0
-; GISEL-NEXT: v_mul_lo_u32 v9, v13, v8
-; GISEL-NEXT: v_xor_b32_e32 v12, v3, v10
-; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18
+; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v1
+; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[1:2]
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc
+; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v10, v[8:9]
+; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; GISEL-NEXT: v_xor_b32_e32 v13, v2, v5
+; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0
+; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8
+; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5
+; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v14, v8
+; GISEL-NEXT: v_mul_lo_u32 v3, v12, v8
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2
-; GISEL-NEXT: v_mul_hi_u32 v9, v13, v8
+; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
-; GISEL-NEXT: v_mul_hi_u32 v8, v14, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0
-; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2
-; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0
-; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0
+; GISEL-NEXT: v_mul_lo_u32 v8, v13, v2
+; GISEL-NEXT: v_mul_hi_u32 v9, v13, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT: v_xor_b32_e32 v10, v11, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2
+; GISEL-NEXT: v_mul_lo_u32 v9, v14, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3
-; GISEL-NEXT: v_mul_hi_u32 v8, v11, v2
+; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v3
-; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v13, 0
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3
+; GISEL-NEXT: v_mul_hi_u32 v9, v14, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0
; GISEL-NEXT: v_mov_b32_e32 v0, v3
; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v8, v[0:1]
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[8:9]
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v11, v[8:9]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v2
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v7
; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5]
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v6
-; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5]
+; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v6
+; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v7
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6
; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v7
-; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v8, v6
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v7
+; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v9, v6
; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5]
; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
-; GISEL-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v5
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_srem_v2i64:
@@ -1117,93 +1118,96 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_trunc_f32_e32 v8, v5
; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8
; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v8
+; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
-; GISEL-NEXT: v_mov_b32_e32 v8, v5
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9]
-; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11]
-; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4
+; GISEL-NEXT: v_mov_b32_e32 v9, v5
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4
-; GISEL-NEXT: v_mul_lo_u32 v8, v7, v13
-; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[9:10]
+; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4
+; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9
+; GISEL-NEXT: v_mul_lo_u32 v4, v8, v9
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v13
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8
-; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT: v_mul_hi_u32 v14, v7, v9
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8
+; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13
; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4
; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v8, vcc
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
; GISEL-NEXT: v_mov_b32_e32 v4, v14
; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13
; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc
-; GISEL-NEXT: v_xor_b32_e32 v15, v0, v8
-; GISEL-NEXT: v_mul_lo_u32 v0, v17, v13
-; GISEL-NEXT: v_mul_lo_u32 v4, v16, v14
-; GISEL-NEXT: v_xor_b32_e32 v18, v1, v8
-; GISEL-NEXT: v_mul_hi_u32 v1, v16, v13
-; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: s_mov_b32 s6, 1
+; GISEL-NEXT: s_cmp_lg_u32 s6, 0
+; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14
+; GISEL-NEXT: s_subb_u32 s6, 0, 0
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v16, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v17, v14
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13
+; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13
+; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4
+; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; GISEL-NEXT: v_mul_hi_u32 v13, v17, v14
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9
+; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
+; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14
+; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v18, v0
-; GISEL-NEXT: v_mul_lo_u32 v14, v15, v1
-; GISEL-NEXT: v_mul_hi_u32 v16, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
+; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0
+; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1
+; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0
; GISEL-NEXT: v_mov_b32_e32 v4, 0x1000
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v16, v18, v1
+; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; GISEL-NEXT: v_mul_hi_u32 v16, v18, v1
+; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1
; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
-; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v15, v0
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v18, v13
-; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v18, v13, vcc
+; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13
+; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4
; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
@@ -1216,22 +1220,19 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
-; GISEL-NEXT: s_mov_b32 s6, 1
-; GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GISEL-NEXT: s_subb_u32 s6, 0, 0
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1]
; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4
-; GISEL-NEXT: v_mul_lo_u32 v19, v7, v0
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
+; GISEL-NEXT: v_mul_lo_u32 v18, v7, v0
; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v19
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0
+; GISEL-NEXT: v_mul_lo_u32 v11, v8, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
; GISEL-NEXT: v_mul_hi_u32 v10, v7, v0
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
@@ -1239,34 +1240,34 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc
+; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
-; GISEL-NEXT: v_xor_b32_e32 v1, v11, v8
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
+; GISEL-NEXT: v_xor_b32_e32 v1, v11, v9
; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3
; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6]
; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11
; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc
; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11
-; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0
+; GISEL-NEXT: v_mul_lo_u32 v2, v8, v0
; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5
; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11
; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5
+; GISEL-NEXT: v_mul_lo_u32 v3, v8, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2
; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
@@ -1274,19 +1275,19 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3
; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2
-; GISEL-NEXT: v_xor_b32_e32 v10, v10, v8
+; GISEL-NEXT: v_xor_b32_e32 v10, v10, v9
; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v8
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v9
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
@@ -1345,96 +1346,96 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; CGP-NEXT: v_trunc_f32_e32 v7, v5
-; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7
-; CGP-NEXT: v_cvt_u32_f32_e32 v8, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v9, v7
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0
-; CGP-NEXT: v_mov_b32_e32 v7, v5
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8]
-; CGP-NEXT: v_mul_hi_u32 v12, v9, v4
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v8, v[10:11]
-; CGP-NEXT: v_mul_lo_u32 v10, v9, v4
-; CGP-NEXT: v_mul_hi_u32 v11, v8, v4
-; CGP-NEXT: v_mul_lo_u32 v4, v8, v13
-; CGP-NEXT: v_mul_lo_u32 v7, v9, v13
-; CGP-NEXT: v_mul_hi_u32 v14, v8, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v9, v13
+; CGP-NEXT: v_trunc_f32_e32 v8, v5
+; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8
+; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
+; CGP-NEXT: v_mov_b32_e32 v9, v5
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
+; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
+; CGP-NEXT: v_mul_hi_u32 v12, v8, v4
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10]
+; CGP-NEXT: v_mul_lo_u32 v10, v8, v4
+; CGP-NEXT: v_mul_lo_u32 v4, v7, v9
+; CGP-NEXT: v_mul_lo_u32 v13, v8, v9
+; CGP-NEXT: v_mul_hi_u32 v14, v7, v9
+; CGP-NEXT: v_mul_hi_u32 v9, v8, v9
; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v8, v4
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4
; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; CGP-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc
+; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
; CGP-NEXT: v_mov_b32_e32 v4, v14
; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v7
+; CGP-NEXT: v_mul_lo_u32 v4, v17, v13
; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15]
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; CGP-NEXT: v_xor_b32_e32 v15, v0, v7
-; CGP-NEXT: v_mul_lo_u32 v0, v17, v13
-; CGP-NEXT: v_mul_lo_u32 v4, v16, v14
-; CGP-NEXT: v_xor_b32_e32 v18, v1, v7
-; CGP-NEXT: v_mul_hi_u32 v1, v16, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v17, v13
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT: v_mul_lo_u32 v9, v16, v14
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; CGP-NEXT: v_mul_hi_u32 v9, v16, v13
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v1, v17, v14
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; CGP-NEXT: v_mul_hi_u32 v4, v16, v14
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13
+; CGP-NEXT: v_mul_hi_u32 v9, v17, v13
+; CGP-NEXT: v_mul_lo_u32 v13, v17, v14
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
+; CGP-NEXT: v_mul_hi_u32 v15, v16, v14
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_mul_hi_u32 v13, v17, v14
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9
+; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
+; CGP-NEXT: v_xor_b32_e32 v18, v0, v9
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v17, v14
+; CGP-NEXT: v_xor_b32_e32 v19, v1, v9
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v18, v0
-; CGP-NEXT: v_mul_lo_u32 v14, v15, v1
-; CGP-NEXT: v_mul_hi_u32 v16, v15, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v18, v0
+; CGP-NEXT: v_mul_lo_u32 v13, v19, v0
+; CGP-NEXT: v_mul_lo_u32 v14, v18, v1
+; CGP-NEXT: v_mul_hi_u32 v15, v18, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v19, v0
; CGP-NEXT: v_mov_b32_e32 v4, 0x1000
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v16, v18, v1
+; CGP-NEXT: v_mul_lo_u32 v15, v19, v1
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v14, v15, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v14, v18, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; CGP-NEXT: v_mul_hi_u32 v16, v18, v1
+; CGP-NEXT: v_mul_hi_u32 v15, v19, v1
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v16, v13
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
-; CGP-NEXT: v_sub_i32_e32 v14, vcc, v15, v0
-; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v18, v13
-; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v18, v13, vcc
+; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0
+; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13
+; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc
; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4
; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4
@@ -1443,78 +1444,78 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
; CGP-NEXT: v_mov_b32_e32 v0, v5
; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5]
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4
; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v8, v[0:1]
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1]
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc
-; CGP-NEXT: v_mul_lo_u32 v19, v8, v0
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4
; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v1, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v19
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v5, v7, v0
+; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v11, v9, v0
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; CGP-NEXT: v_mul_hi_u32 v10, v8, v0
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT: v_mul_lo_u32 v10, v8, v0
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1
+; CGP-NEXT: v_mul_hi_u32 v5, v7, v0
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT: v_mul_hi_u32 v0, v9, v0
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v1
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0
+; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v5, v7
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
-; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v16, vcc
-; CGP-NEXT: v_xor_b32_e32 v1, v10, v7
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v8, v[5:6]
+; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc
+; CGP-NEXT: v_xor_b32_e32 v11, v5, v9
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
+; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc
+; CGP-NEXT: v_xor_b32_e32 v1, v10, v9
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6]
; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10
; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
; CGP-NEXT: v_xor_b32_e32 v12, v2, v10
-; CGP-NEXT: v_mul_lo_u32 v2, v9, v0
-; CGP-NEXT: v_mul_lo_u32 v6, v8, v5
+; CGP-NEXT: v_mul_lo_u32 v2, v8, v0
+; CGP-NEXT: v_mul_lo_u32 v6, v7, v5
; CGP-NEXT: v_xor_b32_e32 v13, v3, v10
-; CGP-NEXT: v_mul_hi_u32 v3, v8, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v9, v0
+; CGP-NEXT: v_mul_hi_u32 v3, v7, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v3, v9, v5
+; CGP-NEXT: v_mul_lo_u32 v3, v8, v5
; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v6, v8, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v7, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v9, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v8, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v0
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0
+; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
; CGP-NEXT: v_mul_lo_u32 v6, v12, v2
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v7
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc
; CGP-NEXT: v_mul_hi_u32 v7, v12, v3
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
@@ -1710,93 +1711,96 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_trunc_f32_e32 v8, v5
; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8
; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v8
+; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
-; GISEL-NEXT: v_mov_b32_e32 v8, v5
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9]
-; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11]
-; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4
+; GISEL-NEXT: v_mov_b32_e32 v9, v5
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4
-; GISEL-NEXT: v_mul_lo_u32 v8, v7, v13
-; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[9:10]
+; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4
+; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9
+; GISEL-NEXT: v_mul_lo_u32 v4, v8, v9
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v13
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8
-; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT: v_mul_hi_u32 v14, v7, v9
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8
+; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13
; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4
; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v8, vcc
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
; GISEL-NEXT: v_mov_b32_e32 v4, v14
; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13
; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc
-; GISEL-NEXT: v_xor_b32_e32 v15, v0, v8
-; GISEL-NEXT: v_mul_lo_u32 v0, v17, v13
-; GISEL-NEXT: v_mul_lo_u32 v4, v16, v14
-; GISEL-NEXT: v_xor_b32_e32 v18, v1, v8
-; GISEL-NEXT: v_mul_hi_u32 v1, v16, v13
-; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: s_mov_b32 s6, 1
+; GISEL-NEXT: s_cmp_lg_u32 s6, 0
+; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14
+; GISEL-NEXT: s_subb_u32 s6, 0, 0
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v16, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v17, v14
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13
+; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13
+; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4
+; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; GISEL-NEXT: v_mul_hi_u32 v13, v17, v14
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9
+; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
+; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14
+; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v18, v0
-; GISEL-NEXT: v_mul_lo_u32 v14, v15, v1
-; GISEL-NEXT: v_mul_hi_u32 v16, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
+; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0
+; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1
+; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0
; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v16, v18, v1
+; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; GISEL-NEXT: v_mul_hi_u32 v16, v18, v1
+; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1
; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
-; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v15, v0
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v18, v13
-; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v18, v13, vcc
+; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13
+; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4
; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
@@ -1809,22 +1813,19 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
-; GISEL-NEXT: s_mov_b32 s6, 1
-; GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GISEL-NEXT: s_subb_u32 s6, 0, 0
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1]
; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4
-; GISEL-NEXT: v_mul_lo_u32 v19, v7, v0
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
+; GISEL-NEXT: v_mul_lo_u32 v18, v7, v0
; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v19
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0
+; GISEL-NEXT: v_mul_lo_u32 v11, v8, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
; GISEL-NEXT: v_mul_hi_u32 v10, v7, v0
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
@@ -1832,34 +1833,34 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc
+; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
-; GISEL-NEXT: v_xor_b32_e32 v1, v11, v8
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
+; GISEL-NEXT: v_xor_b32_e32 v1, v11, v9
; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3
; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6]
; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11
; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc
; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11
-; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0
+; GISEL-NEXT: v_mul_lo_u32 v2, v8, v0
; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5
; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11
; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5
+; GISEL-NEXT: v_mul_lo_u32 v3, v8, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2
; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
@@ -1867,19 +1868,19 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3
; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2
-; GISEL-NEXT: v_xor_b32_e32 v10, v10, v8
+; GISEL-NEXT: v_xor_b32_e32 v10, v10, v9
; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v8
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v9
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
@@ -1938,96 +1939,96 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; CGP-NEXT: v_trunc_f32_e32 v7, v5
-; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7
-; CGP-NEXT: v_cvt_u32_f32_e32 v8, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v9, v7
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0
-; CGP-NEXT: v_mov_b32_e32 v7, v5
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8]
-; CGP-NEXT: v_mul_hi_u32 v12, v9, v4
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v8, v[10:11]
-; CGP-NEXT: v_mul_lo_u32 v10, v9, v4
-; CGP-NEXT: v_mul_hi_u32 v11, v8, v4
-; CGP-NEXT: v_mul_lo_u32 v4, v8, v13
-; CGP-NEXT: v_mul_lo_u32 v7, v9, v13
-; CGP-NEXT: v_mul_hi_u32 v14, v8, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v9, v13
+; CGP-NEXT: v_trunc_f32_e32 v8, v5
+; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8
+; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
+; CGP-NEXT: v_mov_b32_e32 v9, v5
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
+; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
+; CGP-NEXT: v_mul_hi_u32 v12, v8, v4
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10]
+; CGP-NEXT: v_mul_lo_u32 v10, v8, v4
+; CGP-NEXT: v_mul_lo_u32 v4, v7, v9
+; CGP-NEXT: v_mul_lo_u32 v13, v8, v9
+; CGP-NEXT: v_mul_hi_u32 v14, v7, v9
+; CGP-NEXT: v_mul_hi_u32 v9, v8, v9
; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v8, v4
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4
; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; CGP-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc
+; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
; CGP-NEXT: v_mov_b32_e32 v4, v14
; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v7
+; CGP-NEXT: v_mul_lo_u32 v4, v17, v13
; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15]
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; CGP-NEXT: v_xor_b32_e32 v15, v0, v7
-; CGP-NEXT: v_mul_lo_u32 v0, v17, v13
-; CGP-NEXT: v_mul_lo_u32 v4, v16, v14
-; CGP-NEXT: v_xor_b32_e32 v18, v1, v7
-; CGP-NEXT: v_mul_hi_u32 v1, v16, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v17, v13
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT: v_mul_lo_u32 v9, v16, v14
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; CGP-NEXT: v_mul_hi_u32 v9, v16, v13
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v1, v17, v14
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; CGP-NEXT: v_mul_hi_u32 v4, v16, v14
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13
+; CGP-NEXT: v_mul_hi_u32 v9, v17, v13
+; CGP-NEXT: v_mul_lo_u32 v13, v17, v14
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
+; CGP-NEXT: v_mul_hi_u32 v15, v16, v14
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_mul_hi_u32 v13, v17, v14
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9
+; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
+; CGP-NEXT: v_xor_b32_e32 v18, v0, v9
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v17, v14
+; CGP-NEXT: v_xor_b32_e32 v19, v1, v9
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v18, v0
-; CGP-NEXT: v_mul_lo_u32 v14, v15, v1
-; CGP-NEXT: v_mul_hi_u32 v16, v15, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v18, v0
+; CGP-NEXT: v_mul_lo_u32 v13, v19, v0
+; CGP-NEXT: v_mul_lo_u32 v14, v18, v1
+; CGP-NEXT: v_mul_hi_u32 v15, v18, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v19, v0
; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v16, v18, v1
+; CGP-NEXT: v_mul_lo_u32 v15, v19, v1
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v14, v15, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v14, v18, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; CGP-NEXT: v_mul_hi_u32 v16, v18, v1
+; CGP-NEXT: v_mul_hi_u32 v15, v19, v1
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v16, v13
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
-; CGP-NEXT: v_sub_i32_e32 v14, vcc, v15, v0
-; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v18, v13
-; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v18, v13, vcc
+; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0
+; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13
+; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc
; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4
; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4
@@ -2036,78 +2037,78 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
; CGP-NEXT: v_mov_b32_e32 v0, v5
; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5]
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4
; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v8, v[0:1]
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1]
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc
-; CGP-NEXT: v_mul_lo_u32 v19, v8, v0
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4
; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v1, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v19
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v5, v7, v0
+; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v11, v9, v0
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; CGP-NEXT: v_mul_hi_u32 v10, v8, v0
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT: v_mul_lo_u32 v10, v8, v0
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1
+; CGP-NEXT: v_mul_hi_u32 v5, v7, v0
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT: v_mul_hi_u32 v0, v9, v0
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v1
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0
+; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v5, v7
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
-; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v16, vcc
-; CGP-NEXT: v_xor_b32_e32 v1, v10, v7
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v8, v[5:6]
+; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc
+; CGP-NEXT: v_xor_b32_e32 v11, v5, v9
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
+; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc
+; CGP-NEXT: v_xor_b32_e32 v1, v10, v9
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6]
; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10
; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
; CGP-NEXT: v_xor_b32_e32 v12, v2, v10
-; CGP-NEXT: v_mul_lo_u32 v2, v9, v0
-; CGP-NEXT: v_mul_lo_u32 v6, v8, v5
+; CGP-NEXT: v_mul_lo_u32 v2, v8, v0
+; CGP-NEXT: v_mul_lo_u32 v6, v7, v5
; CGP-NEXT: v_xor_b32_e32 v13, v3, v10
-; CGP-NEXT: v_mul_hi_u32 v3, v8, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v9, v0
+; CGP-NEXT: v_mul_hi_u32 v3, v7, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v3, v9, v5
+; CGP-NEXT: v_mul_lo_u32 v3, v8, v5
; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v6, v8, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v7, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v9, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v8, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v0
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0
+; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
; CGP-NEXT: v_mul_lo_u32 v6, v12, v2
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v7
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc
; CGP-NEXT: v_mul_hi_u32 v7, v12, v3
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
@@ -2350,7 +2351,6 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_mov_b32_e32 v8, 0x1000
; GISEL-NEXT: v_mov_b32_e32 v9, 0
; GISEL-NEXT: v_lshl_b64 v[4:5], v[8:9], v4
-; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], v6
; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v5
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v5, v7, vcc
@@ -2425,172 +2425,175 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v10, v15, v0
-; GISEL-NEXT: v_mul_lo_u32 v11, v12, v1
-; GISEL-NEXT: v_mul_hi_u32 v13, v12, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v15, v1
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_hi_u32 v11, v12, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v10
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v13, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v10, v[1:2]
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v7, v13, v[10:11]
-; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v12, v0
-; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], v15, v10, vcc
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v15, v10
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v7
-; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v0, v7, vcc
-; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v13, v1, v6, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v0
-; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v9, v0, vcc
-; GISEL-NEXT: v_xor_b32_e32 v6, v1, v0
-; GISEL-NEXT: v_xor_b32_e32 v8, v8, v0
-; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v6
-; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v8
-; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v11, v5
-; GISEL-NEXT: v_subbrev_u32_e64 v15, s[4:5], 0, v10, vcc
-; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v15, v7
-; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v16, v9, v1, s[4:5]
-; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
-; GISEL-NEXT: v_trunc_f32_e32 v9, v1
-; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v9
-; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0
-; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 0, v6
-; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, v8, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v17, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v20, v9
-; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v10, v7, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v18, v20, v[1:2]
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v14, v5
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v17, v[9:10]
-; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v7, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GISEL-NEXT: v_cndmask_b32_e32 v7, v14, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v20, v0
-; GISEL-NEXT: v_mul_lo_u32 v10, v17, v9
-; GISEL-NEXT: v_mul_hi_u32 v14, v17, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc
-; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v0
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v16, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v13, v15, v10
+; GISEL-NEXT: v_mul_lo_u32 v14, v12, v11
+; GISEL-NEXT: v_lshl_b64 v[0:1], v[8:9], v6
+; GISEL-NEXT: v_mul_hi_u32 v6, v12, v10
+; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v8, v15, v11
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT: v_mul_hi_u32 v9, v12, v11
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v14, v20, v9
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v6
+; GISEL-NEXT: v_mul_hi_u32 v8, v15, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v8, v6
+; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v13, 0
+; GISEL-NEXT: v_xor_b32_e32 v6, v0, v8
+; GISEL-NEXT: v_xor_b32_e32 v8, v1, v8
+; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v6
+; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v8
+; GISEL-NEXT: v_mov_b32_e32 v0, v10
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, v[0:1]
+; GISEL-NEXT: v_mac_f32_e32 v14, 0x4f800000, v16
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v14
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[0:1]
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6
+; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v10
+; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1
+; GISEL-NEXT: v_trunc_f32_e32 v13, v10
+; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v13
+; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1
+; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13
+; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v8, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v14, 0
+; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v12, v9
+; GISEL-NEXT: v_mov_b32_e32 v1, v11
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v13, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10
+; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], v15, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v14, v[11:12]
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v15, v0
+; GISEL-NEXT: v_mul_lo_u32 v12, v14, v11
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v18, v7
+; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v14, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[6:7]
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v9, v5
+; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v0, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v7
+; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9]
+; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v7
+; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7]
+; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v13, v11
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0
+; GISEL-NEXT: v_mul_hi_u32 v15, v14, v11
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_mul_hi_u32 v10, v17, v9
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10
-; GISEL-NEXT: v_mul_hi_u32 v9, v20, v9
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15
+; GISEL-NEXT: v_mul_hi_u32 v11, v13, v11
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v0
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v20, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v14, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v18, v15, v[1:2]
-; GISEL-NEXT: v_xor_b32_e32 v1, v7, v4
-; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v14, v[9:10]
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc
-; GISEL-NEXT: v_xor_b32_e32 v11, v2, v7
-; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0
-; GISEL-NEXT: v_mul_lo_u32 v10, v14, v9
-; GISEL-NEXT: v_xor_b32_e32 v12, v3, v7
-; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v0
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v11, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[1:2]
+; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4
+; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v11, v[9:10]
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v18, v7, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; GISEL-NEXT: v_xor_b32_e32 v12, v2, v5
+; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0
+; GISEL-NEXT: v_mul_lo_u32 v10, v11, v9
+; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5
+; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v15, v9
+; GISEL-NEXT: v_mul_lo_u32 v3, v13, v9
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; GISEL-NEXT: v_mul_hi_u32 v10, v14, v9
+; GISEL-NEXT: v_mul_hi_u32 v10, v11, v9
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10
-; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v15, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0
-; GISEL-NEXT: v_mul_lo_u32 v9, v11, v2
-; GISEL-NEXT: v_mul_hi_u32 v10, v11, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0
-; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0
+; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2
+; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v10, v12, v2
+; GISEL-NEXT: v_mul_lo_u32 v10, v14, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3
-; GISEL-NEXT: v_mul_hi_u32 v9, v11, v2
+; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v3
-; GISEL-NEXT: v_mul_hi_u32 v10, v12, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v13, 0
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3
+; GISEL-NEXT: v_mul_hi_u32 v10, v14, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0
; GISEL-NEXT: v_mov_b32_e32 v0, v3
; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v9, v[0:1]
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v13, v[9:10]
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v8
; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5]
; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v6
; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v8
@@ -2605,13 +2608,13 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v7
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v5
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_srem_v2i64_pow2_shl_denom:
@@ -3030,33 +3033,34 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4
; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], 0, 0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, 0, v1
-; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v1
-; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v3
+; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v1
+; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3
; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1
; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc
-; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5
+; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
+; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4
; GISEL-NEXT: v_trunc_f32_e32 v9, v7
-; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9
-; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v5
+; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v9
+; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v4
; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v9
; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
-; GISEL-NEXT: v_mov_b32_e32 v5, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
-; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7
+; GISEL-NEXT: v_mov_b32_e32 v4, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v4, v13, v7
; GISEL-NEXT: v_mul_hi_u32 v14, v10, v7
; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7
; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GISEL-NEXT: v_mul_lo_u32 v14, v13, v8
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4
; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
@@ -3064,215 +3068,214 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v5
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v4
; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v7, vcc
; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
-; GISEL-NEXT: v_mov_b32_e32 v5, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
-; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7
+; GISEL-NEXT: v_mov_b32_e32 v4, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v4, v13, v7
; GISEL-NEXT: v_add_i32_e32 v11, vcc, 0, v0
; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7
; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7
; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8
-; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v13, v8
+; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc
+; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v13, v4, vcc
; GISEL-NEXT: v_mul_lo_u32 v7, v3, v0
-; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5
-; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0
+; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4
+; GISEL-NEXT: v_and_b32_e32 v9, 0xffffff, v6
+; GISEL-NEXT: v_mul_hi_u32 v6, v11, v0
; GISEL-NEXT: v_mul_hi_u32 v0, v3, v0
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v7, v3, v4
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; GISEL-NEXT: v_mul_hi_u32 v8, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v8
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v7
-; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v10, 0
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v7, v8
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v6
+; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v10, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v0
-; GISEL-NEXT: v_mov_b32_e32 v0, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v5, v[0:1]
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v11, v7
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, v[8:9]
-; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], 0, v0
-; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v2
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v9, v6, v9, s[4:5]
-; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v5, v3, vcc
-; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v7
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v4, v[0:1]
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, 0, v9
+; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v4
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v3, v10, v[7:8]
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v11, v6
+; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v5
; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v7, v1
-; GISEL-NEXT: v_subbrev_u32_e64 v13, s[4:5], 0, v10, vcc
+; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v3, v7, vcc
+; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], v3, v7
; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0
-; GISEL-NEXT: v_trunc_f32_e32 v6, v4
-; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v6
-; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v0
-; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 0, v2
-; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, v3, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v6
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v3
-; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v17, v[0:1]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v0
+; GISEL-NEXT: v_trunc_f32_e32 v7, v5
+; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v7
+; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v0
+; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], 0, v4
+; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, v3, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v7
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
+; GISEL-NEXT: v_mov_b32_e32 v0, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v14, v[0:1]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v14, v[5:6]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v13, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v6, v18, v0, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v0, v17, v4
-; GISEL-NEXT: v_mul_lo_u32 v18, v14, v5
-; GISEL-NEXT: v_mul_hi_u32 v19, v14, v4
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v11, v[6:7]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v7, v15, v0, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v0, v14, v5
+; GISEL-NEXT: v_mul_lo_u32 v15, v11, v6
+; GISEL-NEXT: v_mul_hi_u32 v16, v11, v5
; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v10, v3, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v19
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v16
+; GISEL-NEXT: v_sub_i32_e64 v16, s[4:5], v8, v1
+; GISEL-NEXT: v_subbrev_u32_e64 v17, s[6:7], 0, v10, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v17, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v16, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v17, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v18, v0, v18, s[6:7]
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v19, v17, v5
-; GISEL-NEXT: v_mul_hi_u32 v4, v17, v4
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0
-; GISEL-NEXT: v_mul_hi_u32 v18, v14, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v19, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0
+; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5
+; GISEL-NEXT: v_mul_lo_u32 v15, v14, v6
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v15, v5
+; GISEL-NEXT: v_mul_hi_u32 v15, v11, v6
; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18
-; GISEL-NEXT: v_mul_hi_u32 v5, v17, v5
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v18, v4
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v0
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0
-; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v11, v1
-; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v17, v[0:1]
-; GISEL-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v10, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v14, v[0:1]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v18, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v7, v8, v6, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v17, v4
-; GISEL-NEXT: v_mul_lo_u32 v6, v14, v0
-; GISEL-NEXT: v_mul_hi_u32 v9, v14, v4
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, 0, v12
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_mul_hi_u32 v6, v14, v6
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v19, v15
+; GISEL-NEXT: v_subb_u32_e64 v10, vcc, v10, v3, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v9, v17, v0
-; GISEL-NEXT: v_mul_hi_u32 v4, v17, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v15, v5
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v0
+; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v14, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0
+; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v16, v1
+; GISEL-NEXT: v_mov_b32_e32 v0, v6
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v14, v[0:1]
+; GISEL-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v10, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, v[0:1]
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v16, v15, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc
+; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5
+; GISEL-NEXT: v_mul_lo_u32 v7, v11, v0
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; GISEL-NEXT: v_mul_hi_u32 v9, v11, v5
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, 0, v2
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v9, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v17, v0, vcc
-; GISEL-NEXT: v_mul_lo_u32 v6, v3, v4
-; GISEL-NEXT: v_mul_lo_u32 v9, v8, v5
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5
+; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v14, v0, vcc
+; GISEL-NEXT: v_mul_lo_u32 v7, v3, v5
+; GISEL-NEXT: v_mul_lo_u32 v9, v2, v6
; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1
-; GISEL-NEXT: v_mul_hi_u32 v1, v8, v4
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT: v_mul_hi_u32 v1, v2, v5
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v6, v3, v5
-; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v3, v6
+; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1
-; GISEL-NEXT: v_mul_hi_u32 v9, v8, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v2, v6
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v4, v1
-; GISEL-NEXT: v_mul_hi_u32 v10, v3, v5
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v1
+; GISEL-NEXT: v_mul_hi_u32 v10, v3, v6
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, 0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v1
-; GISEL-NEXT: v_mov_b32_e32 v1, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[1:2]
-; GISEL-NEXT: v_subrev_i32_e32 v1, vcc, 0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6]
-; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v1
+; GISEL-NEXT: v_mov_b32_e32 v1, v6
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v7, v[1:2]
+; GISEL-NEXT: v_subrev_i32_e32 v1, vcc, 0, v8
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v9, v[6:7]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
+; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v3, v6, vcc
+; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v3, v6
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
-; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3
+; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5]
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v4, v2
-; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v4
+; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v8, v4
; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2
; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 0, v3
; GISEL-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 43ebe156eb2a28..5673a6c6e869d0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -1965,8 +1965,9 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v17
; GFX6-NEXT: v_max_i32_e32 v17, -1, v3
; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31
-; GFX6-NEXT: v_min_i32_e32 v18, -1, v3
; GFX6-NEXT: v_max_i32_e32 v17, v17, v19
+; GFX6-NEXT: buffer_load_dword v19, off, s[0:3], s32
+; GFX6-NEXT: v_min_i32_e32 v18, -1, v3
; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16
; GFX6-NEXT: v_min_i32_e32 v17, v17, v18
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v17
@@ -1990,70 +1991,69 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
; GFX6-NEXT: v_max_i32_e32 v17, v17, v22
; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16
; GFX6-NEXT: v_min_i32_e32 v17, v17, v18
-; GFX6-NEXT: buffer_load_dword v18, off, s[0:3], s32
; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v17
; GFX6-NEXT: v_max_i32_e32 v17, -1, v7
; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31
-; GFX6-NEXT: v_min_i32_e32 v19, -1, v7
+; GFX6-NEXT: v_min_i32_e32 v18, -1, v7
; GFX6-NEXT: v_max_i32_e32 v17, v17, v23
-; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16
+; GFX6-NEXT: v_min_i32_e32 v17, v17, v18
; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v17
; GFX6-NEXT: v_max_i32_e32 v17, -1, v8
; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31
-; GFX6-NEXT: v_min_i32_e32 v19, -1, v8
-; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16
+; GFX6-NEXT: v_min_i32_e32 v18, -1, v8
; GFX6-NEXT: v_max_i32_e32 v17, v17, v24
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16
+; GFX6-NEXT: v_min_i32_e32 v17, v17, v18
; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v17
; GFX6-NEXT: v_max_i32_e32 v17, -1, v9
; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31
-; GFX6-NEXT: v_min_i32_e32 v19, -1, v9
-; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16
+; GFX6-NEXT: v_min_i32_e32 v18, -1, v9
; GFX6-NEXT: v_max_i32_e32 v17, v17, v25
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16
+; GFX6-NEXT: v_min_i32_e32 v17, v17, v18
; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17
; GFX6-NEXT: v_max_i32_e32 v17, -1, v10
; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31
-; GFX6-NEXT: v_min_i32_e32 v19, -1, v10
-; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16
+; GFX6-NEXT: v_min_i32_e32 v18, -1, v10
; GFX6-NEXT: v_max_i32_e32 v17, v17, v26
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16
+; GFX6-NEXT: v_min_i32_e32 v17, v17, v18
; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v17
; GFX6-NEXT: v_max_i32_e32 v17, -1, v11
; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31
-; GFX6-NEXT: v_min_i32_e32 v19, -1, v11
-; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16
+; GFX6-NEXT: v_min_i32_e32 v18, -1, v11
; GFX6-NEXT: v_max_i32_e32 v17, v17, v27
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16
+; GFX6-NEXT: v_min_i32_e32 v17, v17, v18
; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v17
; GFX6-NEXT: v_max_i32_e32 v17, -1, v12
; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31
-; GFX6-NEXT: v_min_i32_e32 v19, -1, v12
-; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16
+; GFX6-NEXT: v_min_i32_e32 v18, -1, v12
; GFX6-NEXT: v_max_i32_e32 v17, v17, v28
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16
+; GFX6-NEXT: v_min_i32_e32 v17, v17, v18
; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v17
; GFX6-NEXT: v_max_i32_e32 v17, -1, v13
; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31
-; GFX6-NEXT: v_min_i32_e32 v19, -1, v13
-; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16
+; GFX6-NEXT: v_min_i32_e32 v18, -1, v13
; GFX6-NEXT: v_max_i32_e32 v17, v17, v29
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16
+; GFX6-NEXT: v_min_i32_e32 v17, v17, v18
; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v17
; GFX6-NEXT: v_max_i32_e32 v17, -1, v14
; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31
-; GFX6-NEXT: v_min_i32_e32 v19, -1, v14
-; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16
+; GFX6-NEXT: v_min_i32_e32 v18, -1, v14
; GFX6-NEXT: v_max_i32_e32 v17, v17, v30
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16
+; GFX6-NEXT: v_min_i32_e32 v17, v17, v18
; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v17
; GFX6-NEXT: v_max_i32_e32 v17, -1, v15
; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31
-; GFX6-NEXT: v_min_i32_e32 v19, -1, v15
-; GFX6-NEXT: v_add_i32_e32 v16, vcc, v19, v16
+; GFX6-NEXT: v_min_i32_e32 v18, -1, v15
+; GFX6-NEXT: v_add_i32_e32 v16, vcc, v18, v16
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v18
+; GFX6-NEXT: v_max_i32_e32 v17, v17, v19
; GFX6-NEXT: v_min_i32_e32 v16, v17, v16
; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v16
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -2086,8 +2086,9 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v17
; GFX8-NEXT: v_max_i32_e32 v17, -1, v3
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31
-; GFX8-NEXT: v_min_i32_e32 v18, -1, v3
; GFX8-NEXT: v_max_i32_e32 v17, v17, v19
+; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32
+; GFX8-NEXT: v_min_i32_e32 v18, -1, v3
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16
; GFX8-NEXT: v_min_i32_e32 v17, v17, v18
; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v17
@@ -2111,70 +2112,69 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
; GFX8-NEXT: v_max_i32_e32 v17, v17, v22
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16
; GFX8-NEXT: v_min_i32_e32 v17, v17, v18
-; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32
; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v17
; GFX8-NEXT: v_max_i32_e32 v17, -1, v7
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31
-; GFX8-NEXT: v_min_i32_e32 v19, -1, v7
+; GFX8-NEXT: v_min_i32_e32 v18, -1, v7
; GFX8-NEXT: v_max_i32_e32 v17, v17, v23
-; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16
+; GFX8-NEXT: v_min_i32_e32 v17, v17, v18
; GFX8-NEXT: v_sub_u32_e32 v7, vcc, v7, v17
; GFX8-NEXT: v_max_i32_e32 v17, -1, v8
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31
-; GFX8-NEXT: v_min_i32_e32 v19, -1, v8
-; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16
+; GFX8-NEXT: v_min_i32_e32 v18, -1, v8
; GFX8-NEXT: v_max_i32_e32 v17, v17, v24
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16
+; GFX8-NEXT: v_min_i32_e32 v17, v17, v18
; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v8, v17
; GFX8-NEXT: v_max_i32_e32 v17, -1, v9
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31
-; GFX8-NEXT: v_min_i32_e32 v19, -1, v9
-; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16
+; GFX8-NEXT: v_min_i32_e32 v18, -1, v9
; GFX8-NEXT: v_max_i32_e32 v17, v17, v25
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16
+; GFX8-NEXT: v_min_i32_e32 v17, v17, v18
; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v9, v17
; GFX8-NEXT: v_max_i32_e32 v17, -1, v10
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31
-; GFX8-NEXT: v_min_i32_e32 v19, -1, v10
-; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16
+; GFX8-NEXT: v_min_i32_e32 v18, -1, v10
; GFX8-NEXT: v_max_i32_e32 v17, v17, v26
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16
+; GFX8-NEXT: v_min_i32_e32 v17, v17, v18
; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v10, v17
; GFX8-NEXT: v_max_i32_e32 v17, -1, v11
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31
-; GFX8-NEXT: v_min_i32_e32 v19, -1, v11
-; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16
+; GFX8-NEXT: v_min_i32_e32 v18, -1, v11
; GFX8-NEXT: v_max_i32_e32 v17, v17, v27
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16
+; GFX8-NEXT: v_min_i32_e32 v17, v17, v18
; GFX8-NEXT: v_sub_u32_e32 v11, vcc, v11, v17
; GFX8-NEXT: v_max_i32_e32 v17, -1, v12
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31
-; GFX8-NEXT: v_min_i32_e32 v19, -1, v12
-; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16
+; GFX8-NEXT: v_min_i32_e32 v18, -1, v12
; GFX8-NEXT: v_max_i32_e32 v17, v17, v28
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16
+; GFX8-NEXT: v_min_i32_e32 v17, v17, v18
; GFX8-NEXT: v_sub_u32_e32 v12, vcc, v12, v17
; GFX8-NEXT: v_max_i32_e32 v17, -1, v13
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31
-; GFX8-NEXT: v_min_i32_e32 v19, -1, v13
-; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16
+; GFX8-NEXT: v_min_i32_e32 v18, -1, v13
; GFX8-NEXT: v_max_i32_e32 v17, v17, v29
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16
+; GFX8-NEXT: v_min_i32_e32 v17, v17, v18
; GFX8-NEXT: v_sub_u32_e32 v13, vcc, v13, v17
; GFX8-NEXT: v_max_i32_e32 v17, -1, v14
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31
-; GFX8-NEXT: v_min_i32_e32 v19, -1, v14
-; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16
+; GFX8-NEXT: v_min_i32_e32 v18, -1, v14
; GFX8-NEXT: v_max_i32_e32 v17, v17, v30
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16
+; GFX8-NEXT: v_min_i32_e32 v17, v17, v18
; GFX8-NEXT: v_sub_u32_e32 v14, vcc, v14, v17
; GFX8-NEXT: v_max_i32_e32 v17, -1, v15
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31
-; GFX8-NEXT: v_min_i32_e32 v19, -1, v15
-; GFX8-NEXT: v_add_u32_e32 v16, vcc, v19, v16
+; GFX8-NEXT: v_min_i32_e32 v18, -1, v15
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v18, v16
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v18
+; GFX8-NEXT: v_max_i32_e32 v17, v17, v19
; GFX8-NEXT: v_min_i32_e32 v16, v17, v16
; GFX8-NEXT: v_sub_u32_e32 v15, vcc, v15, v16
; GFX8-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 1ee521b3dedac1..f5a901b024ef52 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -365,256 +365,256 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-LABEL: v_udiv_v2i64:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4
-; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4
-; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6
-; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v7
-; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v6
-; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v5, vcc
-; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v7, s[4:5]
-; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11
-; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13
-; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10
-; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11
-; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10
-; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11
-; GISEL-NEXT: v_trunc_f32_e32 v13, v13
-; GISEL-NEXT: v_trunc_f32_e32 v14, v14
-; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13
-; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13
-; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14
-; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14
-; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10
-; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13
-; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11
-; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14
-; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11
-; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11
-; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17
-; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20
-; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17
-; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20
-; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18
-; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20
-; GISEL-NEXT: v_mul_lo_u32 v19, v8, v10
-; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10
-; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16
-; GISEL-NEXT: v_mul_hi_u32 v20, v8, v10
-; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20
-; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19
-; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16
-; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21
-; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19
-; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21
-; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19
-; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18
-; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16
-; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19
-; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17
-; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18
-; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16
-; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20
-; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17
-; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[10:11]
-; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15]
-; GISEL-NEXT: v_add_i32_e64 v21, s[6:7], v21, v22
-; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v23
-; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[12:13]
-; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[16:17]
-; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24
-; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20
-; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22
-; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19
-; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16
-; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17
-; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19
-; GISEL-NEXT: v_mul_lo_u32 v20, v8, v10
-; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18
-; GISEL-NEXT: v_mul_hi_u32 v18, v8, v10
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19
-; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11
-; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11
-; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7]
-; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11
-; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9]
-; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20
-; GISEL-NEXT: v_mul_lo_u32 v8, v8, v13
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8
-; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19
-; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9
-; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20
-; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v18
-; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19
-; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16
-; GISEL-NEXT: v_mul_lo_u32 v16, v10, v8
+; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4
+; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5
+; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8
+; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8
+; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8
+; GISEL-NEXT: v_trunc_f32_e32 v9, v9
+; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9
+; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
+; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9
+; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v4
+; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc
+; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8
+; GISEL-NEXT: v_mul_lo_u32 v13, v11, v8
+; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9
+; GISEL-NEXT: v_mul_hi_u32 v15, v10, v8
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12
+; GISEL-NEXT: v_mul_lo_u32 v15, v8, v13
+; GISEL-NEXT: v_mul_hi_u32 v16, v8, v12
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT: v_mul_lo_u32 v15, v9, v13
+; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT: v_mul_hi_u32 v16, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16
-; GISEL-NEXT: v_mul_lo_u32 v16, v13, v8
-; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17
-; GISEL-NEXT: v_mul_hi_u32 v12, v10, v8
-; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8
-; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9
-; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20
-; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9
-; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17
-; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9
-; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11]
-; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7]
-; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19
-; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc
+; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8
+; GISEL-NEXT: v_mul_lo_u32 v11, v11, v8
+; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9
+; GISEL-NEXT: v_mul_hi_u32 v10, v10, v8
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_mul_lo_u32 v11, v9, v12
+; GISEL-NEXT: v_mul_lo_u32 v13, v8, v10
+; GISEL-NEXT: v_mul_hi_u32 v14, v8, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT: v_mul_lo_u32 v13, v9, v10
+; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT: v_mul_hi_u32 v14, v8, v10
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17
-; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18
-; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19
-; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10
-; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10
-; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10
-; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11
-; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11
-; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11
-; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v12
-; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15
-; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v13, v8, vcc
-; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v12, v0, v8
-; GISEL-NEXT: v_mul_lo_u32 v13, v1, v8
-; GISEL-NEXT: v_mul_hi_u32 v14, v0, v8
-; GISEL-NEXT: v_mul_hi_u32 v15, v1, v8
-; GISEL-NEXT: v_mul_lo_u32 v8, v2, v9
-; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12
-; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9
-; GISEL-NEXT: v_mul_hi_u32 v21, v3, v9
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v10
-; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v18, v8
-; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v11
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc
+; GISEL-NEXT: v_mul_lo_u32 v10, v1, v8
+; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9
+; GISEL-NEXT: v_mul_hi_u32 v12, v0, v8
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7]
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14
-; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v19
-; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9]
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9
+; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8
+; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v14
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v16
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v10, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v14, v4, v8
-; GISEL-NEXT: v_mul_lo_u32 v16, v5, v8
-; GISEL-NEXT: v_mul_hi_u32 v17, v4, v8
-; GISEL-NEXT: v_mul_lo_u32 v18, v6, v9
-; GISEL-NEXT: v_mul_lo_u32 v19, v7, v9
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_hi_u32 v11, v6, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v4, v8
+; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8
+; GISEL-NEXT: v_mul_lo_u32 v12, v4, v9
+; GISEL-NEXT: v_mul_hi_u32 v13, v4, v8
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v1, v11, vcc
+; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v11
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v10, v11, v12, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v4
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v8
+; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v9, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v13, v0, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v11
+; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v12, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v11, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v6
+; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v7
+; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
+; GISEL-NEXT: v_trunc_f32_e32 v5, v5
+; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v6
+; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v7, vcc
+; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4
+; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4
+; GISEL-NEXT: v_mul_lo_u32 v12, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v13, v8, v4
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_mul_lo_u32 v12, v5, v10
+; GISEL-NEXT: v_mul_lo_u32 v13, v4, v11
+; GISEL-NEXT: v_mul_hi_u32 v14, v4, v10
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v8
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v14
-; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], 1, v9
-; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, v18
-; GISEL-NEXT: v_add_i32_e64 v18, s[10:11], 1, v13
-; GISEL-NEXT: v_add_i32_e64 v10, s[12:13], v15, v10
-; GISEL-NEXT: v_add_i32_e64 v15, s[12:13], 1, v14
-; GISEL-NEXT: v_add_i32_e64 v12, s[14:15], v21, v12
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[14:15], v0, v4
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[16:17], v2, v6
-; GISEL-NEXT: v_sub_i32_e64 v0, s[18:19], v0, v4
-; GISEL-NEXT: v_sub_i32_e64 v2, s[20:21], v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v20, v4, v10
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[22:23], v0, v4
-; GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v10, vcc
-; GISEL-NEXT: v_mul_lo_u32 v4, v6, v12
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
-; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], 0, v12, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[14:15]
-; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[16:17]
-; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v19, v4
-; GISEL-NEXT: v_addc_u32_e64 v19, s[6:7], 0, v0, s[10:11]
-; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v17
-; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], 0, v2, s[12:13]
-; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v11
-; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], v1, v16, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v16
-; GISEL-NEXT: v_subb_u32_e64 v16, s[6:7], v3, v4, s[8:9]
-; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[22:23]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v5
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[10:11], v11, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_lo_u32 v13, v5, v11
+; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10
+; GISEL-NEXT: v_mul_hi_u32 v14, v4, v11
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v11, vcc
+; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4
+; GISEL-NEXT: v_mul_lo_u32 v9, v9, v4
+; GISEL-NEXT: v_mul_lo_u32 v11, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v8, v8, v4
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT: v_mul_lo_u32 v9, v5, v10
+; GISEL-NEXT: v_mul_lo_u32 v11, v4, v8
+; GISEL-NEXT: v_mul_hi_u32 v12, v4, v10
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8
+; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10
+; GISEL-NEXT: v_mul_hi_u32 v12, v4, v8
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc
+; GISEL-NEXT: v_mul_lo_u32 v8, v3, v4
+; GISEL-NEXT: v_mul_lo_u32 v9, v2, v5
+; GISEL-NEXT: v_mul_hi_u32 v10, v2, v4
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5
+; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT: v_mul_hi_u32 v10, v2, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT: v_mul_lo_u32 v8, v6, v4
+; GISEL-NEXT: v_mul_lo_u32 v9, v7, v4
+; GISEL-NEXT: v_mul_lo_u32 v10, v6, v5
+; GISEL-NEXT: v_mul_hi_u32 v11, v6, v4
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v9, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v9
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v8, v9, v10, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v6
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v4
+; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v5, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v5, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v7
-; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v7, s[8:9]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v16, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v6, v16, v6, s[10:11]
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc
-; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[18:19]
-; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[20:21]
-; GISEL-NEXT: v_cndmask_b32_e64 v16, v16, v20, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v7
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, v5
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v3, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v16
-; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[8:9]
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v1
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v1, v13, v18, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v14, v15, s[8:9]
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v0, v19, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v2, v17, s[8:9]
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v3, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v4, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v12, v5, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v9
+; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v10, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v6, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_udiv_v2i64:
@@ -1252,256 +1252,256 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_mov_b32_e32 v10, 0
; GISEL-NEXT: v_lshl_b64 v[7:8], v[9:10], v4
; GISEL-NEXT: v_lshl_b64 v[4:5], v[9:10], v6
-; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7
-; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v8
-; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v7
-; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v4
-; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v5
-; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v4
-; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v8, vcc
-; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v5, s[4:5]
-; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11
-; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13
-; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10
-; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11
-; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10
-; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11
-; GISEL-NEXT: v_trunc_f32_e32 v13, v13
-; GISEL-NEXT: v_trunc_f32_e32 v14, v14
-; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13
-; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13
-; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14
-; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14
-; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10
-; GISEL-NEXT: v_mul_lo_u32 v16, v6, v13
-; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11
-; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14
-; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11
-; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11
-; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17
-; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20
-; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17
-; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20
-; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18
-; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20
-; GISEL-NEXT: v_mul_lo_u32 v19, v6, v10
-; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10
-; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16
-; GISEL-NEXT: v_mul_hi_u32 v20, v6, v10
-; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20
-; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19
-; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16
-; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21
-; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19
-; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21
-; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19
-; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18
-; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16
-; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19
-; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17
-; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18
-; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16
-; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20
-; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17
-; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[10:11]
-; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15]
-; GISEL-NEXT: v_add_i32_e64 v21, s[6:7], v21, v22
-; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v23
-; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[12:13]
-; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[16:17]
-; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24
-; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20
-; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22
-; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19
-; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16
-; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17
-; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19
-; GISEL-NEXT: v_mul_lo_u32 v20, v6, v10
-; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18
-; GISEL-NEXT: v_mul_hi_u32 v18, v6, v10
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19
-; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11
-; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11
-; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7]
-; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11
-; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9]
-; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20
-; GISEL-NEXT: v_mul_lo_u32 v6, v6, v13
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6
-; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19
-; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9
-; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20
-; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v18
-; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19
-; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16
-; GISEL-NEXT: v_mul_lo_u32 v16, v10, v6
+; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v7
+; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8
+; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v9
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
+; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6
+; GISEL-NEXT: v_trunc_f32_e32 v9, v9
+; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9
+; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9
+; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v7
+; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc
+; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6
+; GISEL-NEXT: v_mul_lo_u32 v13, v11, v6
+; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9
+; GISEL-NEXT: v_mul_hi_u32 v15, v10, v6
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12
+; GISEL-NEXT: v_mul_lo_u32 v15, v6, v13
+; GISEL-NEXT: v_mul_hi_u32 v16, v6, v12
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT: v_mul_lo_u32 v15, v9, v13
+; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT: v_mul_hi_u32 v16, v6, v13
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16
-; GISEL-NEXT: v_mul_lo_u32 v16, v13, v6
-; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17
-; GISEL-NEXT: v_mul_hi_u32 v12, v10, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, v13, v6
-; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9
-; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20
-; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9
-; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17
-; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9
-; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11]
-; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7]
-; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19
-; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc
+; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6
+; GISEL-NEXT: v_mul_lo_u32 v11, v11, v6
+; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9
+; GISEL-NEXT: v_mul_hi_u32 v10, v10, v6
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_mul_lo_u32 v11, v9, v12
+; GISEL-NEXT: v_mul_lo_u32 v13, v6, v10
+; GISEL-NEXT: v_mul_hi_u32 v14, v6, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT: v_mul_lo_u32 v13, v9, v10
+; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17
-; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18
-; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19
-; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10
-; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10
-; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10
-; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11
-; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11
-; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11
-; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v12
-; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15
-; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc
-; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v12, v0, v6
-; GISEL-NEXT: v_mul_lo_u32 v13, v1, v6
-; GISEL-NEXT: v_mul_hi_u32 v14, v0, v6
-; GISEL-NEXT: v_mul_hi_u32 v15, v1, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v2, v9
-; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12
-; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9
-; GISEL-NEXT: v_mul_hi_u32 v21, v3, v9
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v10
-; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v18, v6
-; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v11
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc
+; GISEL-NEXT: v_mul_lo_u32 v10, v1, v6
+; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9
+; GISEL-NEXT: v_mul_hi_u32 v12, v0, v6
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7]
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14
-; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v19
-; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9]
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9
+; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6
+; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v14
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v16
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v10, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v14, v7, v6
-; GISEL-NEXT: v_mul_lo_u32 v16, v8, v6
-; GISEL-NEXT: v_mul_hi_u32 v17, v7, v6
-; GISEL-NEXT: v_mul_lo_u32 v18, v4, v9
-; GISEL-NEXT: v_mul_lo_u32 v19, v5, v9
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_hi_u32 v11, v4, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v7, v6
+; GISEL-NEXT: v_mul_lo_u32 v11, v8, v6
+; GISEL-NEXT: v_mul_lo_u32 v12, v7, v9
+; GISEL-NEXT: v_mul_hi_u32 v13, v7, v6
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v1, v11, vcc
+; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v11
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v10, v11, v12, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v7
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v6
+; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v9, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v8
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v13, v0, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v11
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v11, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v4
+; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v5
+; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
+; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6
+; GISEL-NEXT: v_trunc_f32_e32 v7, v7
+; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7
+; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4
+; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc
+; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6
+; GISEL-NEXT: v_mul_lo_u32 v11, v9, v6
+; GISEL-NEXT: v_mul_lo_u32 v12, v8, v7
+; GISEL-NEXT: v_mul_hi_u32 v13, v8, v6
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_mul_lo_u32 v12, v7, v10
+; GISEL-NEXT: v_mul_lo_u32 v13, v6, v11
+; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v6
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v14
-; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], 1, v9
-; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, v18
-; GISEL-NEXT: v_add_i32_e64 v18, s[10:11], 1, v13
-; GISEL-NEXT: v_add_i32_e64 v10, s[12:13], v15, v10
-; GISEL-NEXT: v_add_i32_e64 v15, s[12:13], 1, v14
-; GISEL-NEXT: v_add_i32_e64 v12, s[14:15], v21, v12
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[14:15], v0, v7
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[16:17], v2, v4
-; GISEL-NEXT: v_sub_i32_e64 v0, s[18:19], v0, v7
-; GISEL-NEXT: v_sub_i32_e64 v2, s[20:21], v2, v4
-; GISEL-NEXT: v_mul_lo_u32 v20, v7, v10
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[22:23], v0, v7
-; GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v10, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
-; GISEL-NEXT: v_mul_lo_u32 v2, v4, v12
-; GISEL-NEXT: v_add_i32_e64 v4, s[24:25], v16, v20
-; GISEL-NEXT: v_addc_u32_e64 v7, s[6:7], 0, v12, s[6:7]
-; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], v19, v2
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[14:15]
-; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v17
-; GISEL-NEXT: v_subb_u32_e64 v17, s[6:7], v1, v4, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[16:17]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v17, v8
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[14:15], v17, v8
-; GISEL-NEXT: v_addc_u32_e64 v17, s[10:11], 0, v0, s[10:11]
-; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v1, v8, s[4:5]
-; GISEL-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[18:19]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v8
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[10:11], v1, v8
-; GISEL-NEXT: v_addc_u32_e64 v1, s[12:13], 0, v7, s[12:13]
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[22:23]
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; GISEL-NEXT: v_subb_u32_e64 v11, vcc, v3, v2, s[8:9]
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v5
-; GISEL-NEXT: v_subb_u32_e64 v2, s[8:9], v2, v5, s[8:9]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v11, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[6:7]
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_lo_u32 v13, v7, v11
+; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10
+; GISEL-NEXT: v_mul_hi_u32 v14, v6, v11
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v11, v7, v11
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc
+; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6
+; GISEL-NEXT: v_mul_lo_u32 v9, v9, v6
+; GISEL-NEXT: v_mul_lo_u32 v11, v8, v7
+; GISEL-NEXT: v_mul_hi_u32 v8, v8, v6
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT: v_mul_lo_u32 v9, v7, v10
+; GISEL-NEXT: v_mul_lo_u32 v11, v6, v8
+; GISEL-NEXT: v_mul_hi_u32 v12, v6, v10
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT: v_mul_lo_u32 v11, v7, v8
+; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10
+; GISEL-NEXT: v_mul_hi_u32 v12, v6, v8
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc
+; GISEL-NEXT: v_mul_lo_u32 v8, v3, v6
+; GISEL-NEXT: v_mul_lo_u32 v9, v2, v7
+; GISEL-NEXT: v_mul_hi_u32 v10, v2, v6
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT: v_mul_lo_u32 v9, v3, v7
+; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6
+; GISEL-NEXT: v_mul_hi_u32 v10, v2, v7
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT: v_mul_lo_u32 v8, v4, v6
+; GISEL-NEXT: v_mul_lo_u32 v9, v5, v6
+; GISEL-NEXT: v_mul_lo_u32 v10, v4, v7
+; GISEL-NEXT: v_mul_hi_u32 v11, v4, v6
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v9, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v9
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v8, v9, v10, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v4
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc
+; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v6
+; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v7, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT: v_subbrev_u32_e64 v2, vcc, 0, v2, s[20:21]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v16, s[14:15]
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v11, v4, s[8:9]
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[10:11]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v19, s[6:7]
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v2
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v13, v18, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v14, v15, s[8:9]
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v0, v17, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v7, v1, s[8:9]
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v3, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v4, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v12, v5, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v9
+; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v10, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_udiv_v2i64_pow2_shl_denom:
@@ -1904,16 +1904,14 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-LABEL: v_udiv_v2i64_24bit:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v0
-; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
-; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6
+; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v4
+; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v6
; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0
-; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v1
-; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
+; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v3
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc
-; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v0
-; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v0
+; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v1
+; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1
; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6
; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v6
@@ -1929,76 +1927,78 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11
; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11
-; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8
+; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6
+; GISEL-NEXT: v_mul_lo_u32 v6, v4, v8
; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7
; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11
-; GISEL-NEXT: v_mul_lo_u32 v14, v4, v6
-; GISEL-NEXT: v_mul_lo_u32 v15, v5, v6
-; GISEL-NEXT: v_mul_hi_u32 v16, v4, v6
+; GISEL-NEXT: v_mul_lo_u32 v14, v4, v12
+; GISEL-NEXT: v_mul_lo_u32 v15, v5, v12
+; GISEL-NEXT: v_mul_hi_u32 v16, v4, v12
; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7
; GISEL-NEXT: v_mul_lo_u32 v18, v10, v7
; GISEL-NEXT: v_mul_hi_u32 v19, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13
+; GISEL-NEXT: v_mul_lo_u32 v15, v11, v17
+; GISEL-NEXT: v_mul_hi_u32 v18, v7, v17
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19
+; GISEL-NEXT: v_mul_lo_u32 v19, v7, v13
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19
+; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18
; GISEL-NEXT: v_mul_lo_u32 v15, v8, v14
-; GISEL-NEXT: v_mul_hi_u32 v20, v6, v14
+; GISEL-NEXT: v_mul_hi_u32 v18, v12, v14
; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13
-; GISEL-NEXT: v_mul_lo_u32 v18, v11, v17
-; GISEL-NEXT: v_mul_hi_u32 v21, v7, v17
; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19
-; GISEL-NEXT: v_mul_lo_u32 v16, v6, v12
-; GISEL-NEXT: v_mul_lo_u32 v19, v8, v12
-; GISEL-NEXT: v_mul_hi_u32 v22, v6, v12
-; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12
-; GISEL-NEXT: v_mul_lo_u32 v23, v7, v13
-; GISEL-NEXT: v_mul_lo_u32 v24, v11, v13
-; GISEL-NEXT: v_mul_hi_u32 v25, v7, v13
+; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v6, v16
+; GISEL-NEXT: v_mul_lo_u32 v6, v12, v16
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v15, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v18
+; GISEL-NEXT: v_mul_lo_u32 v6, v8, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18
+; GISEL-NEXT: v_mul_hi_u32 v18, v12, v16
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v6, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v6
+; GISEL-NEXT: v_mul_lo_u32 v6, v11, v13
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v17
+; GISEL-NEXT: v_mul_hi_u32 v17, v7, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, v6, v17
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v20, vcc, v20, v6
+; GISEL-NEXT: v_and_b32_e32 v6, 0xffffff, v0
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v2
+; GISEL-NEXT: v_mul_hi_u32 v2, v8, v16
; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v19, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v23
-; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v24, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v22
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v21
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v25
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v20
-; GISEL-NEXT: v_add_i32_e32 v18, vcc, v23, v18
-; GISEL-NEXT: v_add_i32_e32 v19, vcc, v24, v21
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19
; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v18
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v20, v18
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14
-; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc
-; GISEL-NEXT: v_mul_lo_u32 v12, v4, v6
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v6
-; GISEL-NEXT: v_mul_hi_u32 v14, v4, v6
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v8, v4, v12
+; GISEL-NEXT: v_mul_lo_u32 v5, v5, v12
+; GISEL-NEXT: v_mul_hi_u32 v14, v4, v12
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17
; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v13, vcc
; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7
; GISEL-NEXT: v_mul_lo_u32 v10, v10, v7
; GISEL-NEXT: v_mul_hi_u32 v15, v9, v7
-; GISEL-NEXT: v_mul_lo_u32 v4, v4, v8
-; GISEL-NEXT: v_mul_lo_u32 v16, v8, v12
-; GISEL-NEXT: v_mul_hi_u32 v17, v6, v12
-; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12
+; GISEL-NEXT: v_mul_lo_u32 v4, v4, v2
+; GISEL-NEXT: v_mul_lo_u32 v16, v2, v8
+; GISEL-NEXT: v_mul_hi_u32 v17, v12, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, v2, v8
; GISEL-NEXT: v_mul_lo_u32 v9, v9, v11
; GISEL-NEXT: v_mul_lo_u32 v18, v11, v13
; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13
@@ -2007,140 +2007,140 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v9
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v15
-; GISEL-NEXT: v_mul_lo_u32 v9, v6, v4
-; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4
-; GISEL-NEXT: v_mul_hi_u32 v14, v6, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4
+; GISEL-NEXT: v_mul_lo_u32 v9, v12, v4
+; GISEL-NEXT: v_mul_lo_u32 v10, v2, v4
+; GISEL-NEXT: v_mul_hi_u32 v14, v12, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v2, v4
; GISEL-NEXT: v_mul_lo_u32 v15, v7, v5
-; GISEL-NEXT: v_mul_lo_u32 v20, v11, v5
-; GISEL-NEXT: v_mul_hi_u32 v21, v7, v5
-; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15
; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v20, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v21
+; GISEL-NEXT: v_mul_lo_u32 v15, v11, v5
+; GISEL-NEXT: v_mul_hi_u32 v19, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
+; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v16, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v10, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v17
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19
+; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v15
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v17
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v17
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v14
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc
-; GISEL-NEXT: v_mul_lo_u32 v8, 0, v6
-; GISEL-NEXT: v_mul_hi_u32 v9, v3, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, 0, v6
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v14
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
+; GISEL-NEXT: v_mul_lo_u32 v4, 0, v8
+; GISEL-NEXT: v_mul_hi_u32 v9, v6, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, 0, v8
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13
; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc
; GISEL-NEXT: v_mul_lo_u32 v10, 0, v7
-; GISEL-NEXT: v_mul_hi_u32 v11, v2, v7
+; GISEL-NEXT: v_mul_hi_u32 v11, v0, v7
; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7
-; GISEL-NEXT: v_mul_lo_u32 v12, v3, v4
-; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4
-; GISEL-NEXT: v_mul_hi_u32 v14, v3, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4
-; GISEL-NEXT: v_mul_lo_u32 v15, v2, v5
+; GISEL-NEXT: v_mul_lo_u32 v12, v6, v2
+; GISEL-NEXT: v_mul_lo_u32 v13, 0, v2
+; GISEL-NEXT: v_mul_hi_u32 v14, v6, v2
+; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2
+; GISEL-NEXT: v_mul_lo_u32 v15, v0, v5
; GISEL-NEXT: v_mul_lo_u32 v16, 0, v5
-; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5
+; GISEL-NEXT: v_mul_hi_u32 v17, v0, v5
; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_mul_lo_u32 v9, v1, v6
-; GISEL-NEXT: v_mul_lo_u32 v12, 0, v6
-; GISEL-NEXT: v_mul_hi_u32 v13, v1, v6
+; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4
+; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4
+; GISEL-NEXT: v_mul_hi_u32 v13, v3, v4
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_lo_u32 v11, v0, v7
+; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7
; GISEL-NEXT: v_mul_lo_u32 v14, 0, v7
-; GISEL-NEXT: v_mul_hi_u32 v15, v0, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT: v_mul_hi_u32 v15, v1, v7
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v2, v8
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT: v_mul_lo_u32 v8, v1, v4
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v6
-; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v4, vcc
-; GISEL-NEXT: v_mul_lo_u32 v17, v0, v5
+; GISEL-NEXT: v_mul_lo_u32 v2, v3, v8
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v4
+; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v8, vcc
+; GISEL-NEXT: v_mul_lo_u32 v17, v1, v5
; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v7
; GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v10
-; GISEL-NEXT: v_addc_u32_e32 v20, vcc, 0, v16, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v18
-; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v19, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v15
-; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v9
-; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v8, vcc
-; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 0, v8
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v11
-; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], 0, v13, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v13, s[6:7], 0, v13
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v0
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v17
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15
+; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v2, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v11
+; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], 0, v12, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[6:7]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v14, s[6:7]
-; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11
-; GISEL-NEXT: v_cndmask_b32_e32 v11, -1, v15, vcc
-; GISEL-NEXT: v_subbrev_u32_e64 v13, vcc, 0, v13, s[4:5]
-; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v1
-; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
-; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v13, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v11
+; GISEL-NEXT: v_add_i32_e64 v9, s[10:11], 1, v18
+; GISEL-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v19, s[10:11]
+; GISEL-NEXT: v_sub_i32_e64 v2, s[10:11], 0, v2
+; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], 0, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v13, s[6:7]
+; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v15, s[8:9]
+; GISEL-NEXT: v_subbrev_u32_e64 v12, vcc, 0, v12, s[4:5]
+; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v3
+; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v12, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
-; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v3, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v12, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v14, vcc
; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v17, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v16, v20, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v6, v19, v21, s[4:5]
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v9, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v4, v19, v11, s[4:5]
+; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v15
; GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[4:5]
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_udiv_v2i64_24bit:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index ffebde52df4a3e..e3c1a52696b47c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -1087,95 +1087,95 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0
; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v3, s[0:1]
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s15
+; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s14
; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v4, vcc
-; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s14
-; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2
-; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v8
-; GFX8-NEXT: v_add_f32_e32 v1, v2, v1
+; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v2
+; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1
+; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v8
; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v5, vcc
-; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v6
; GFX8-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
; GFX8-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1
-; GFX8-NEXT: v_trunc_f32_e32 v14, v2
-; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v14
+; GFX8-NEXT: v_trunc_f32_e32 v3, v2
+; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v3
; GFX8-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v1
-; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v7, s[0:1]
+; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v1
+; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v6
+; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v7, s[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v12, 0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v3
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11
-; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10
; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v15, 0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v14, v14
+; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3]
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10
+; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v12, v[2:3]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11
-; GFX8-NEXT: v_cndmask_b32_e64 v16, v3, v16, s[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v14, v[2:3]
-; GFX8-NEXT: v_add_u32_e64 v17, s[0:1], 1, v12
-; GFX8-NEXT: v_addc_u32_e64 v18, s[0:1], 0, v13, s[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v15, v[2:3]
-; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v5, v4, vcc
-; GFX8-NEXT: v_mul_lo_u32 v4, v14, v1
-; GFX8-NEXT: v_mul_lo_u32 v5, v15, v2
-; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10
-; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v3, vcc
-; GFX8-NEXT: v_mul_hi_u32 v3, v15, v1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1]
+; GFX8-NEXT: v_mul_lo_u32 v3, v15, v1
+; GFX8-NEXT: v_mul_lo_u32 v17, v12, v2
+; GFX8-NEXT: v_mul_hi_u32 v5, v12, v1
+; GFX8-NEXT: v_mul_hi_u32 v1, v15, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v17
+; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v4, v14, v2
-; GFX8-NEXT: v_mul_hi_u32 v1, v14, v1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3
-; GFX8-NEXT: v_mul_hi_u32 v5, v15, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v4, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v5
+; GFX8-NEXT: v_mul_lo_u32 v5, v15, v2
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v17, v3
+; GFX8-NEXT: v_mul_hi_u32 v17, v12, v2
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
-; GFX8-NEXT: v_mul_hi_u32 v2, v14, v2
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v17
+; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v17
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v13
+; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v14, vcc
+; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10
+; GFX8-NEXT: v_mul_hi_u32 v2, v15, v2
+; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
-; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v1
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, 0
-; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v14, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v1
+; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v12, 0
+; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v15, v2, vcc
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v12, v17, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v13, v17, vcc
; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v14, v[1:2]
-; GFX8-NEXT: v_cndmask_b32_e32 v12, v13, v18, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[1:2]
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc
; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9
-; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v15, v[4:5]
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v12, v[4:5]
; GFX8-NEXT: v_cndmask_b32_e64 v1, v6, v2, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v12, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v13, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v19, vcc
-; GFX8-NEXT: v_mul_lo_u32 v7, v14, v3
-; GFX8-NEXT: v_mul_lo_u32 v9, v15, v4
+; GFX8-NEXT: v_mul_lo_u32 v7, v15, v3
+; GFX8-NEXT: v_mul_lo_u32 v9, v12, v4
; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1]
-; GFX8-NEXT: v_mul_hi_u32 v8, v15, v3
+; GFX8-NEXT: v_mul_hi_u32 v8, v12, v3
; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v20, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9
; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v8, v14, v4
-; GFX8-NEXT: v_mul_hi_u32 v3, v14, v3
+; GFX8-NEXT: v_mul_lo_u32 v8, v15, v4
+; GFX8-NEXT: v_mul_hi_u32 v3, v15, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7
-; GFX8-NEXT: v_mul_hi_u32 v9, v15, v4
+; GFX8-NEXT: v_mul_hi_u32 v9, v12, v4
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v9
; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v9
-; GFX8-NEXT: v_mul_hi_u32 v4, v14, v4
+; GFX8-NEXT: v_mul_hi_u32 v4, v15, v4
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v15, v3
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v12, v3
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc
; GFX8-NEXT: v_mul_lo_u32 v7, s11, v3
; GFX8-NEXT: v_mul_lo_u32 v8, s10, v4
; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1]
@@ -1216,27 +1216,27 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1]
; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s14, v8
; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v3, vcc
-; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v9
-; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v10, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v12
-; GFX8-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7
-; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v3, v0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v12
-; GFX8-NEXT: v_subrev_u32_e32 v18, vcc, s14, v7
-; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[0:1]
-; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v13
-; GFX8-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v14, s[0:1]
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v16, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v17, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1]
+; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v9
+; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v3, v0, vcc
+; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v10, s[0:1]
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v14
+; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc
+; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
+; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s14, v7
+; GFX8-NEXT: v_subbrev_u32_e64 v0, s[0:1], 0, v0, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc
; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v13, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v14, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v10, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v18, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc
; GFX8-NEXT: v_mov_b32_e32 v9, s4
; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1]
@@ -1330,182 +1330,181 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_mul_lo_u32 v4, s17, v1
; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
; GFX9-NEXT: v_mul_hi_u32 v3, s16, v1
-; GFX9-NEXT: v_mov_b32_e32 v6, s5
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v8, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v0, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v7, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_add3_u32 v9, v3, v0, v5
-; GFX9-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s4, v9, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v5, s17
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v8, v[2:3]
-; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s16, v1
-; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v5, v3, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v1
+; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v8, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v4, s17
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v7, v[1:2]
+; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s16, v0
+; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v4, v2, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v1
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v1
-; GFX9-NEXT: v_sub_u32_e32 v3, s17, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v10, v4, v5, s[0:1]
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s7
-; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v6, vcc
-; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s6
-; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4
-; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s4, v2
-; GFX9-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v8
-; GFX9-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
-; GFX9-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
-; GFX9-NEXT: v_trunc_f32_e32 v15, v4
-; GFX9-NEXT: v_mul_f32_e32 v4, 0xcf800000, v15
-; GFX9-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v3
-; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v9, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v12
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v11
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1]
+; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6
+; GFX9-NEXT: v_sub_u32_e32 v2, s17, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v5, vcc
+; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3
+; GFX9-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
+; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s4, v1
+; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v6, vcc
+; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
+; GFX9-NEXT: v_trunc_f32_e32 v4, v3
+; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4
+; GFX9-NEXT: v_add_f32_e32 v2, v3, v2
+; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v2
+; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v7
+; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v8, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v4
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v11
+; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v10
; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v16, 0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v12
-; GFX9-NEXT: v_cndmask_b32_e64 v17, v5, v17, s[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[4:5]
-; GFX9-NEXT: v_add_co_u32_e64 v18, s[0:1], 1, v13
-; GFX9-NEXT: v_addc_co_u32_e64 v19, s[0:1], 0, v14, s[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v16, v[4:5]
-; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v6, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v11
+; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1]
+; GFX9-NEXT: v_mul_lo_u32 v4, v15, v2
+; GFX9-NEXT: v_mul_lo_u32 v17, v12, v3
+; GFX9-NEXT: v_mul_hi_u32 v6, v12, v2
+; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v17
+; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_mul_lo_u32 v6, v15, v3
-; GFX9-NEXT: v_mul_lo_u32 v7, v16, v4
-; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s4, v11
-; GFX9-NEXT: v_subbrev_co_u32_e32 v21, vcc, 0, v5, vcc
-; GFX9-NEXT: v_mul_hi_u32 v5, v16, v3
-; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v15, v4
+; GFX9-NEXT: v_add_u32_e32 v4, v17, v4
+; GFX9-NEXT: v_mul_hi_u32 v17, v12, v3
; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT: v_mul_hi_u32 v7, v16, v4
-; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT: v_add_u32_e32 v6, v6, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v16, v3
-; GFX9-NEXT: v_add3_u32 v4, v6, v5, v4
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v16, 0
-; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v4, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v18, vcc
-; GFX9-NEXT: v_mov_b32_e32 v3, v6
-; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v15, v[3:4]
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v19, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10
-; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[2:3], s3, v16, v[6:7]
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v4, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v9, v13, s[0:1]
-; GFX9-NEXT: v_mul_lo_u32 v8, v15, v5
-; GFX9-NEXT: v_mul_lo_u32 v9, v16, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v20, vcc
-; GFX9-NEXT: v_mul_hi_u32 v11, v16, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v21, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9
-; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v11
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v17
+; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v6, v6, v17
+; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13
+; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v14, vcc
+; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v10
+; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v5, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v2
+; GFX9-NEXT: v_add3_u32 v3, v6, v4, v3
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0
+; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v3, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc
+; GFX9-NEXT: v_mov_b32_e32 v2, v5
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v15, v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s3, v12, v[5:6]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v3, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v13, s[0:1]
+; GFX9-NEXT: v_mul_lo_u32 v7, v15, v4
+; GFX9-NEXT: v_mul_lo_u32 v8, v12, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v19, vcc
+; GFX9-NEXT: v_mul_hi_u32 v10, v12, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v20, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v11, v15, v6
+; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10
+; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v10, v15, v5
+; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4
+; GFX9-NEXT: v_add_u32_e32 v7, v8, v7
+; GFX9-NEXT: v_mul_hi_u32 v8, v12, v5
; GFX9-NEXT: v_mul_hi_u32 v5, v15, v5
-; GFX9-NEXT: v_add_u32_e32 v8, v9, v8
-; GFX9-NEXT: v_mul_hi_u32 v9, v16, v6
-; GFX9-NEXT: v_mul_hi_u32 v6, v15, v6
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v11, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9
-; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8
-; GFX9-NEXT: v_add_u32_e32 v9, v11, v9
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v6, v9, v8, v6
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v16, v5
-; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v6, vcc
-; GFX9-NEXT: v_mul_lo_u32 v8, s19, v5
-; GFX9-NEXT: v_mul_lo_u32 v9, s18, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v2, v7, s[0:1]
-; GFX9-NEXT: v_mul_hi_u32 v2, s18, v5
-; GFX9-NEXT: v_mul_hi_u32 v5, s19, v5
-; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9
-; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v8, s19, v6
-; GFX9-NEXT: v_add_u32_e32 v2, v9, v2
-; GFX9-NEXT: v_mul_hi_u32 v9, s18, v6
-; GFX9-NEXT: v_mul_hi_u32 v13, s19, v6
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v8, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9
-; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v5, v2
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s6, v12, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v10, s[0:1]
-; GFX9-NEXT: v_add_u32_e32 v1, v11, v9
-; GFX9-NEXT: v_add3_u32 v9, v1, v2, v13
-; GFX9-NEXT: v_mov_b32_e32 v1, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v9, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v10, s19
-; GFX9-NEXT: v_mov_b32_e32 v6, s7
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v12, v[1:2]
-; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s18, v5
-; GFX9-NEXT: v_subb_co_u32_e64 v10, s[0:1], v10, v1, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v10
-; GFX9-NEXT: v_sub_u32_e32 v1, s19, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v10
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s6, v2
-; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v1, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v12
-; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v9, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v13
-; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v11
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v13
-; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s6, v11
-; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1]
-; GFX9-NEXT: v_add_co_u32_e64 v17, s[0:1], 1, v14
-; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: v_addc_co_u32_e64 v18, s[0:1], 0, v15, s[0:1]
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v12, v6, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v14, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v19, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[0:1]
-; GFX9-NEXT: global_store_dwordx4 v0, v[3:6], s[12:13]
-; GFX9-NEXT: global_store_dwordx4 v0, v[7:10], s[14:15]
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7
+; GFX9-NEXT: v_add_u32_e32 v8, v10, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v5, v8, v7, v5
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc
+; GFX9-NEXT: v_mul_lo_u32 v7, s19, v4
+; GFX9-NEXT: v_mul_lo_u32 v8, s18, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v1, v6, s[0:1]
+; GFX9-NEXT: v_mul_hi_u32 v1, s18, v4
+; GFX9-NEXT: v_mul_hi_u32 v4, s19, v4
+; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v7, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v7, s19, v5
+; GFX9-NEXT: v_add_u32_e32 v1, v8, v1
+; GFX9-NEXT: v_mul_hi_u32 v8, s18, v5
+; GFX9-NEXT: v_mul_hi_u32 v12, s19, v5
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v4, v1
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s6, v11, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v0, v9, s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v0, v10, v8
+; GFX9-NEXT: v_add3_u32 v8, v0, v1, v12
+; GFX9-NEXT: v_mov_b32_e32 v0, v5
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v8, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v9, s19
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s7, v11, v[0:1]
+; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s18, v4
+; GFX9-NEXT: v_subb_co_u32_e64 v9, s[0:1], v9, v0, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9
+; GFX9-NEXT: v_sub_u32_e32 v0, s19, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1]
+; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s6, v1
+; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v0, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v12
+; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v10
+; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v12
+; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v11
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v5, vcc
+; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 1, v14
+; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc
+; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s6, v10
+; GFX9-NEXT: v_subbrev_co_u32_e64 v0, s[0:1], 0, v0, s[0:1]
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX9-NEXT: v_mov_b32_e32 v13, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v11, v5, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v14, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v15, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v8, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v13, v[2:5], s[12:13]
+; GFX9-NEXT: global_store_dwordx4 v13, v[6:9], s[14:15]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: udivrem_v2i64:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index faad7e93da5d37..2be4b52198b455 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -359,254 +359,254 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-LABEL: v_urem_v2i64:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4
-; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4
-; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6
-; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v7
-; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v6
-; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v5, vcc
-; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v7, s[4:5]
-; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11
-; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13
-; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10
-; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11
-; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10
-; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11
-; GISEL-NEXT: v_trunc_f32_e32 v13, v13
-; GISEL-NEXT: v_trunc_f32_e32 v14, v14
-; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13
-; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13
-; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14
-; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14
-; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10
-; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13
-; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11
-; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14
-; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11
-; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11
-; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17
-; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20
-; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17
-; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20
-; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18
-; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20
-; GISEL-NEXT: v_mul_lo_u32 v19, v8, v10
-; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10
-; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16
-; GISEL-NEXT: v_mul_hi_u32 v20, v8, v10
-; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20
-; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19
-; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16
-; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21
-; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19
-; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21
-; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19
-; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18
-; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16
-; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19
-; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17
-; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18
-; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16
-; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20
-; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17
-; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[10:11]
-; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15]
-; GISEL-NEXT: v_add_i32_e64 v21, s[6:7], v21, v22
-; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v23
-; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[12:13]
-; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[16:17]
-; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24
-; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20
-; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22
-; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19
-; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16
-; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17
-; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19
-; GISEL-NEXT: v_mul_lo_u32 v20, v8, v10
-; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18
-; GISEL-NEXT: v_mul_hi_u32 v18, v8, v10
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19
-; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11
-; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11
-; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7]
-; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11
-; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9]
-; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20
-; GISEL-NEXT: v_mul_lo_u32 v8, v8, v13
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8
-; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19
-; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9
-; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20
-; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v18
-; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19
-; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16
-; GISEL-NEXT: v_mul_lo_u32 v16, v10, v8
+; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4
+; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5
+; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8
+; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8
+; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8
+; GISEL-NEXT: v_trunc_f32_e32 v9, v9
+; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9
+; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
+; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9
+; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v4
+; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc
+; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8
+; GISEL-NEXT: v_mul_lo_u32 v13, v11, v8
+; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9
+; GISEL-NEXT: v_mul_hi_u32 v15, v10, v8
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12
+; GISEL-NEXT: v_mul_lo_u32 v15, v8, v13
+; GISEL-NEXT: v_mul_hi_u32 v16, v8, v12
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT: v_mul_lo_u32 v15, v9, v13
+; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT: v_mul_hi_u32 v16, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16
-; GISEL-NEXT: v_mul_lo_u32 v16, v13, v8
-; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17
-; GISEL-NEXT: v_mul_hi_u32 v12, v10, v8
-; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8
-; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9
-; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20
-; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9
-; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17
-; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9
-; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11]
-; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7]
-; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19
-; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc
+; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8
+; GISEL-NEXT: v_mul_lo_u32 v11, v11, v8
+; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9
+; GISEL-NEXT: v_mul_hi_u32 v10, v10, v8
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_mul_lo_u32 v11, v9, v12
+; GISEL-NEXT: v_mul_lo_u32 v13, v8, v10
+; GISEL-NEXT: v_mul_hi_u32 v14, v8, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT: v_mul_lo_u32 v13, v9, v10
+; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT: v_mul_hi_u32 v14, v8, v10
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc
+; GISEL-NEXT: v_mul_lo_u32 v10, v1, v8
+; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9
+; GISEL-NEXT: v_mul_hi_u32 v12, v0, v8
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17
-; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18
-; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19
-; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10
-; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10
-; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10
-; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11
-; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11
-; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11
-; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v12
-; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15
-; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v13, v8, vcc
-; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v12, v0, v8
-; GISEL-NEXT: v_mul_lo_u32 v13, v1, v8
-; GISEL-NEXT: v_mul_hi_u32 v14, v0, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9
; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8
-; GISEL-NEXT: v_mul_lo_u32 v15, v2, v9
-; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12
-; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9
-; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10
-; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15
-; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v20, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7]
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14
-; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19
-; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v16
+; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v4, v8
+; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8
+; GISEL-NEXT: v_mul_lo_u32 v9, v4, v9
+; GISEL-NEXT: v_mul_hi_u32 v8, v4, v8
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v8, vcc
+; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v8
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], v0, v4
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; GISEL-NEXT: v_subbrev_u32_e64 v11, vcc, 0, v1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v10, v4
+; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v1, v5, s[4:5]
+; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v6
+; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v7
+; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
+; GISEL-NEXT: v_trunc_f32_e32 v5, v5
+; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v6
+; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v7, vcc
+; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4
+; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4
+; GISEL-NEXT: v_mul_lo_u32 v12, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v13, v8, v4
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_mul_lo_u32 v12, v5, v10
+; GISEL-NEXT: v_mul_lo_u32 v13, v4, v11
+; GISEL-NEXT: v_mul_hi_u32 v14, v4, v10
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13
-; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_lo_u32 v13, v5, v11
+; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10
+; GISEL-NEXT: v_mul_hi_u32 v14, v4, v11
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v16, v4, v10
-; GISEL-NEXT: v_mul_lo_u32 v17, v5, v10
-; GISEL-NEXT: v_mul_hi_u32 v10, v4, v10
-; GISEL-NEXT: v_mul_lo_u32 v18, v6, v11
-; GISEL-NEXT: v_mul_lo_u32 v19, v7, v11
-; GISEL-NEXT: v_mul_hi_u32 v11, v6, v11
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16
-; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v18
-; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v13
-; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v12
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v4
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v6
-; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], v0, v4
-; GISEL-NEXT: v_sub_i32_e64 v13, s[12:13], v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v8, v4, v8
-; GISEL-NEXT: v_mul_lo_u32 v9, v6, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v4
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v13, v6
-; GISEL-NEXT: v_sub_i32_e64 v4, s[14:15], v12, v4
-; GISEL-NEXT: v_sub_i32_e64 v6, s[16:17], v13, v6
-; GISEL-NEXT: v_add_i32_e64 v8, s[18:19], v17, v8
-; GISEL-NEXT: v_add_i32_e64 v9, s[18:19], v19, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[8:9]
-; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v10
-; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v11
-; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v1, v8, vcc
-; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v8
-; GISEL-NEXT: v_subb_u32_e64 v8, s[6:7], v3, v9, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v9
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v5
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v7
-; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v7, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v5
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v8, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7]
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v11, vcc
+; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4
+; GISEL-NEXT: v_mul_lo_u32 v9, v9, v4
+; GISEL-NEXT: v_mul_lo_u32 v11, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v8, v8, v4
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT: v_mul_lo_u32 v9, v5, v10
+; GISEL-NEXT: v_mul_lo_u32 v11, v4, v8
+; GISEL-NEXT: v_mul_hi_u32 v12, v4, v10
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8
+; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10
+; GISEL-NEXT: v_mul_hi_u32 v12, v4, v8
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc
+; GISEL-NEXT: v_mul_lo_u32 v8, v3, v4
+; GISEL-NEXT: v_mul_lo_u32 v9, v2, v5
+; GISEL-NEXT: v_mul_hi_u32 v10, v2, v4
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5
+; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT: v_mul_hi_u32 v10, v2, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT: v_mul_lo_u32 v8, v6, v4
+; GISEL-NEXT: v_mul_lo_u32 v9, v7, v4
+; GISEL-NEXT: v_mul_lo_u32 v5, v6, v5
+; GISEL-NEXT: v_mul_hi_u32 v4, v6, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v3, v4, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v6
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; GISEL-NEXT: v_subbrev_u32_e64 v9, vcc, 0, v3, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v6
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT: v_subbrev_u32_e64 v18, vcc, 0, v1, s[10:11]
-; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v5, s[10:11]
-; GISEL-NEXT: v_subbrev_u32_e64 v19, vcc, 0, v3, s[12:13]
-; GISEL-NEXT: v_subb_u32_e64 v3, vcc, v3, v7, s[12:13]
-; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v15, s[8:9]
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v5
-; GISEL-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[14:15]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v19, v7
-; GISEL-NEXT: v_subbrev_u32_e64 v3, s[6:7], 0, v3, s[16:17]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v5
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v19, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v16, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[8:9]
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v5
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v6, s[8:9]
-; GISEL-NEXT: v_cndmask_b32_e64 v1, v18, v1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[8:9]
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v7
+; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc
+; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v8, v6
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v7, s[4:5]
+; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_urem_v2i64:
@@ -1103,20 +1103,20 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7
; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
; GISEL-NEXT: s_cmp_lg_u32 s4, 0
-; GISEL-NEXT: s_subb_u32 s4, 0, 0
+; GISEL-NEXT: s_subb_u32 s6, 0, 0
; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6
; GISEL-NEXT: s_cmp_lg_u32 s5, 0
-; GISEL-NEXT: s_subb_u32 s5, 0, 0
+; GISEL-NEXT: s_subb_u32 s7, 0, 0
; GISEL-NEXT: v_trunc_f32_e32 v7, v7
; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7
; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
; GISEL-NEXT: v_mul_lo_u32 v8, v7, v5
; GISEL-NEXT: v_mul_lo_u32 v9, v6, v5
-; GISEL-NEXT: v_mul_lo_u32 v10, s4, v6
+; GISEL-NEXT: v_mul_lo_u32 v10, s6, v6
; GISEL-NEXT: v_mul_hi_u32 v11, v6, v5
-; GISEL-NEXT: v_mul_lo_u32 v12, s5, v6
+; GISEL-NEXT: v_mul_lo_u32 v12, s7, v6
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v8
; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9
; GISEL-NEXT: v_mul_hi_u32 v14, v6, v9
@@ -1134,41 +1134,41 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v17, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v17, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v19, v11
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v15
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v15
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v18
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v14
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v11
; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v7, v10, vcc
; GISEL-NEXT: v_mul_lo_u32 v12, v11, v5
-; GISEL-NEXT: v_mul_lo_u32 v13, s4, v11
+; GISEL-NEXT: v_mul_lo_u32 v13, s6, v11
; GISEL-NEXT: v_mul_hi_u32 v14, v11, v5
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc
; GISEL-NEXT: v_mul_lo_u32 v8, v6, v5
-; GISEL-NEXT: v_mul_lo_u32 v9, s5, v6
+; GISEL-NEXT: v_mul_lo_u32 v9, s7, v6
; GISEL-NEXT: v_mul_hi_u32 v15, v6, v5
; GISEL-NEXT: v_mul_lo_u32 v16, v10, v5
; GISEL-NEXT: v_mul_lo_u32 v17, v10, v12
@@ -1176,9 +1176,9 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12
; GISEL-NEXT: v_mul_lo_u32 v5, v7, v5
; GISEL-NEXT: v_mul_lo_u32 v19, v7, v8
-; GISEL-NEXT: v_mul_hi_u32 v20, v6, v8
-; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT: v_mul_hi_u32 v16, v6, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v14
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v15
@@ -1186,38 +1186,38 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9
; GISEL-NEXT: v_mul_hi_u32 v15, v11, v9
; GISEL-NEXT: v_mul_hi_u32 v9, v10, v9
-; GISEL-NEXT: v_mul_lo_u32 v16, v6, v5
-; GISEL-NEXT: v_mul_lo_u32 v21, v7, v5
-; GISEL-NEXT: v_mul_hi_u32 v22, v6, v5
-; GISEL-NEXT: v_mul_hi_u32 v5, v7, v5
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13
; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v21, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v13, v6, v5
+; GISEL-NEXT: v_mul_lo_u32 v18, v7, v5
+; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v19, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16
+; GISEL-NEXT: v_mul_hi_u32 v13, v6, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v7, v5
+; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v14, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7]
+; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v18, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7]
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v22
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v19, v16
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v21, v18
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v19, v18
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v15
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v15
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v9, vcc
; GISEL-NEXT: v_mul_lo_u32 v10, v1, v11
@@ -1675,254 +1675,254 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_mov_b32_e32 v10, 0
; GISEL-NEXT: v_lshl_b64 v[7:8], v[9:10], v4
; GISEL-NEXT: v_lshl_b64 v[4:5], v[9:10], v6
-; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7
-; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v8
-; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v7
-; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v4
-; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v5
-; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v4
-; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v8, vcc
-; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v5, s[4:5]
-; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11
-; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13
-; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10
-; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11
-; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10
-; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11
-; GISEL-NEXT: v_trunc_f32_e32 v13, v13
-; GISEL-NEXT: v_trunc_f32_e32 v14, v14
-; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13
-; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13
-; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14
-; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14
-; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10
-; GISEL-NEXT: v_mul_lo_u32 v16, v6, v13
-; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11
-; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14
-; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11
-; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11
-; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17
-; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20
-; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17
-; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20
-; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18
-; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20
-; GISEL-NEXT: v_mul_lo_u32 v19, v6, v10
-; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10
-; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16
-; GISEL-NEXT: v_mul_hi_u32 v20, v6, v10
-; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20
-; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19
-; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16
-; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21
-; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19
-; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21
-; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19
-; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18
-; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16
-; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19
-; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17
-; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18
-; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16
-; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20
-; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17
-; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[10:11]
-; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15]
-; GISEL-NEXT: v_add_i32_e64 v21, s[6:7], v21, v22
-; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v23
-; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[12:13]
-; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[16:17]
-; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24
-; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20
-; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22
-; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19
-; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16
-; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17
-; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19
-; GISEL-NEXT: v_mul_lo_u32 v20, v6, v10
-; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18
-; GISEL-NEXT: v_mul_hi_u32 v18, v6, v10
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19
-; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11
-; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11
-; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7]
-; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11
-; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9]
-; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20
-; GISEL-NEXT: v_mul_lo_u32 v6, v6, v13
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6
-; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19
-; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9
-; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20
-; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v18
-; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19
-; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16
-; GISEL-NEXT: v_mul_lo_u32 v16, v10, v6
+; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v7
+; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8
+; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v9
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
+; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6
+; GISEL-NEXT: v_trunc_f32_e32 v9, v9
+; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9
+; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9
+; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v7
+; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc
+; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6
+; GISEL-NEXT: v_mul_lo_u32 v13, v11, v6
+; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9
+; GISEL-NEXT: v_mul_hi_u32 v15, v10, v6
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12
+; GISEL-NEXT: v_mul_lo_u32 v15, v6, v13
+; GISEL-NEXT: v_mul_hi_u32 v16, v6, v12
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT: v_mul_lo_u32 v15, v9, v13
+; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT: v_mul_hi_u32 v16, v6, v13
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16
-; GISEL-NEXT: v_mul_lo_u32 v16, v13, v6
-; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17
-; GISEL-NEXT: v_mul_hi_u32 v12, v10, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, v13, v6
-; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9
-; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20
-; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9
-; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17
-; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9
-; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11]
-; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7]
-; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19
-; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc
+; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6
+; GISEL-NEXT: v_mul_lo_u32 v11, v11, v6
+; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9
+; GISEL-NEXT: v_mul_hi_u32 v10, v10, v6
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_mul_lo_u32 v11, v9, v12
+; GISEL-NEXT: v_mul_lo_u32 v13, v6, v10
+; GISEL-NEXT: v_mul_hi_u32 v14, v6, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT: v_mul_lo_u32 v13, v9, v10
+; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17
-; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18
-; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19
-; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10
-; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10
-; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10
-; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11
-; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11
-; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11
-; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v12
-; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15
-; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc
-; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v12, v0, v6
-; GISEL-NEXT: v_mul_lo_u32 v13, v1, v6
-; GISEL-NEXT: v_mul_hi_u32 v14, v0, v6
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc
+; GISEL-NEXT: v_mul_lo_u32 v10, v1, v6
+; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9
+; GISEL-NEXT: v_mul_hi_u32 v12, v0, v6
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9
; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6
-; GISEL-NEXT: v_mul_lo_u32 v15, v2, v9
-; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12
-; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9
-; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10
-; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15
-; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v20, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7]
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14
-; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19
-; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v16
+; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v7, v6
+; GISEL-NEXT: v_mul_lo_u32 v11, v8, v6
+; GISEL-NEXT: v_mul_lo_u32 v9, v7, v9
+; GISEL-NEXT: v_mul_hi_u32 v6, v7, v6
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v6, vcc
+; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v6
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], v0, v7
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT: v_subbrev_u32_e64 v11, vcc, 0, v1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v8
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc
+; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v10, v7
+; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v1, v8, s[4:5]
+; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v4
+; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v5
+; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
+; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6
+; GISEL-NEXT: v_trunc_f32_e32 v7, v7
+; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7
+; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4
+; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc
+; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6
+; GISEL-NEXT: v_mul_lo_u32 v11, v9, v6
+; GISEL-NEXT: v_mul_lo_u32 v12, v8, v7
+; GISEL-NEXT: v_mul_hi_u32 v13, v8, v6
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_mul_lo_u32 v12, v7, v10
+; GISEL-NEXT: v_mul_lo_u32 v13, v6, v11
+; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13
-; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_lo_u32 v13, v7, v11
+; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10
+; GISEL-NEXT: v_mul_hi_u32 v14, v6, v11
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v16, v7, v10
-; GISEL-NEXT: v_mul_lo_u32 v17, v8, v10
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v11, v7, v11
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc
+; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6
+; GISEL-NEXT: v_mul_lo_u32 v9, v9, v6
+; GISEL-NEXT: v_mul_lo_u32 v11, v8, v7
+; GISEL-NEXT: v_mul_hi_u32 v8, v8, v6
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT: v_mul_lo_u32 v9, v7, v10
+; GISEL-NEXT: v_mul_lo_u32 v11, v6, v8
+; GISEL-NEXT: v_mul_hi_u32 v12, v6, v10
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT: v_mul_lo_u32 v11, v7, v8
; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10
-; GISEL-NEXT: v_mul_lo_u32 v18, v4, v11
-; GISEL-NEXT: v_mul_lo_u32 v19, v5, v11
-; GISEL-NEXT: v_mul_hi_u32 v11, v4, v11
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16
-; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v18
-; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v13
-; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v12
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v7
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v4
-; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], v0, v7
-; GISEL-NEXT: v_sub_i32_e64 v13, s[12:13], v2, v4
-; GISEL-NEXT: v_mul_lo_u32 v6, v7, v6
-; GISEL-NEXT: v_mul_lo_u32 v9, v4, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v7
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v13, v4
-; GISEL-NEXT: v_sub_i32_e64 v7, s[14:15], v12, v7
-; GISEL-NEXT: v_sub_i32_e64 v4, s[16:17], v13, v4
-; GISEL-NEXT: v_add_i32_e64 v6, s[18:19], v17, v6
-; GISEL-NEXT: v_add_i32_e64 v9, s[18:19], v19, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[8:9]
-; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v10
-; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v11
-; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v1, v6, vcc
-; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v6
-; GISEL-NEXT: v_subb_u32_e64 v6, s[6:7], v3, v9, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v9
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v8
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
-; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v5, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v8
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT: v_subbrev_u32_e64 v18, vcc, 0, v1, s[10:11]
-; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v8, s[10:11]
-; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[4:5]
-; GISEL-NEXT: v_subbrev_u32_e64 v14, vcc, 0, v3, s[12:13]
-; GISEL-NEXT: v_subb_u32_e64 v3, vcc, v3, v5, s[12:13]
-; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v15, s[8:9]
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v8
-; GISEL-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[14:15]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v5
-; GISEL-NEXT: v_subbrev_u32_e64 v3, s[6:7], 0, v3, s[16:17]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v8
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v14, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v12, v6, v8
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc
+; GISEL-NEXT: v_mul_lo_u32 v8, v3, v6
+; GISEL-NEXT: v_mul_lo_u32 v9, v2, v7
+; GISEL-NEXT: v_mul_hi_u32 v10, v2, v6
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT: v_mul_lo_u32 v9, v3, v7
+; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6
+; GISEL-NEXT: v_mul_hi_u32 v10, v2, v7
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT: v_mul_lo_u32 v8, v4, v6
+; GISEL-NEXT: v_mul_lo_u32 v9, v5, v6
+; GISEL-NEXT: v_mul_lo_u32 v7, v4, v7
+; GISEL-NEXT: v_mul_hi_u32 v6, v4, v6
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v6, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v6
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v16, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v17, s[8:9]
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v5
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v12, v7, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v13, v4, s[8:9]
-; GISEL-NEXT: v_cndmask_b32_e64 v1, v18, v1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v14, v3, s[8:9]
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v4
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc
+; GISEL-NEXT: v_subbrev_u32_e64 v9, vcc, 0, v3, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v8, v4
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v5, s[4:5]
+; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_urem_v2i64_pow2_shl_denom:
@@ -2319,16 +2319,14 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-LABEL: v_urem_v2i64_24bit:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v0
-; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
-; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6
+; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v4
+; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v6
; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0
-; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v1
-; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
+; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v3
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc
-; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v0
-; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v0
+; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v1
+; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1
; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6
; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v6
@@ -2344,76 +2342,78 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11
; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11
-; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8
+; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6
+; GISEL-NEXT: v_mul_lo_u32 v6, v4, v8
; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7
; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11
-; GISEL-NEXT: v_mul_lo_u32 v14, v4, v6
-; GISEL-NEXT: v_mul_lo_u32 v15, v5, v6
-; GISEL-NEXT: v_mul_hi_u32 v16, v4, v6
+; GISEL-NEXT: v_mul_lo_u32 v14, v4, v12
+; GISEL-NEXT: v_mul_lo_u32 v15, v5, v12
+; GISEL-NEXT: v_mul_hi_u32 v16, v4, v12
; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7
; GISEL-NEXT: v_mul_lo_u32 v18, v10, v7
; GISEL-NEXT: v_mul_hi_u32 v19, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13
+; GISEL-NEXT: v_mul_lo_u32 v15, v11, v17
+; GISEL-NEXT: v_mul_hi_u32 v18, v7, v17
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19
+; GISEL-NEXT: v_mul_lo_u32 v19, v7, v13
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19
+; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18
; GISEL-NEXT: v_mul_lo_u32 v15, v8, v14
-; GISEL-NEXT: v_mul_hi_u32 v20, v6, v14
+; GISEL-NEXT: v_mul_hi_u32 v18, v12, v14
; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13
-; GISEL-NEXT: v_mul_lo_u32 v18, v11, v17
-; GISEL-NEXT: v_mul_hi_u32 v21, v7, v17
; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19
-; GISEL-NEXT: v_mul_lo_u32 v16, v6, v12
-; GISEL-NEXT: v_mul_lo_u32 v19, v8, v12
-; GISEL-NEXT: v_mul_hi_u32 v22, v6, v12
-; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12
-; GISEL-NEXT: v_mul_lo_u32 v23, v7, v13
-; GISEL-NEXT: v_mul_lo_u32 v24, v11, v13
-; GISEL-NEXT: v_mul_hi_u32 v25, v7, v13
+; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v6, v16
+; GISEL-NEXT: v_mul_lo_u32 v6, v12, v16
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v15, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v18
+; GISEL-NEXT: v_mul_lo_u32 v6, v8, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18
+; GISEL-NEXT: v_mul_hi_u32 v18, v12, v16
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v6, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v6
+; GISEL-NEXT: v_mul_lo_u32 v6, v11, v13
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v17
+; GISEL-NEXT: v_mul_hi_u32 v17, v7, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, v6, v17
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v20, vcc, v20, v6
+; GISEL-NEXT: v_and_b32_e32 v6, 0xffffff, v0
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v2
+; GISEL-NEXT: v_mul_hi_u32 v2, v8, v16
; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v19, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v23
-; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v24, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v22
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v21
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v25
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v20
-; GISEL-NEXT: v_add_i32_e32 v18, vcc, v23, v18
-; GISEL-NEXT: v_add_i32_e32 v19, vcc, v24, v21
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19
; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v18
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v20, v18
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14
-; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc
-; GISEL-NEXT: v_mul_lo_u32 v12, v4, v6
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v6
-; GISEL-NEXT: v_mul_hi_u32 v14, v4, v6
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v8, v4, v12
+; GISEL-NEXT: v_mul_lo_u32 v5, v5, v12
+; GISEL-NEXT: v_mul_hi_u32 v14, v4, v12
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17
; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v13, vcc
; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7
; GISEL-NEXT: v_mul_lo_u32 v10, v10, v7
; GISEL-NEXT: v_mul_hi_u32 v15, v9, v7
-; GISEL-NEXT: v_mul_lo_u32 v4, v4, v8
-; GISEL-NEXT: v_mul_lo_u32 v16, v8, v12
-; GISEL-NEXT: v_mul_hi_u32 v17, v6, v12
-; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12
+; GISEL-NEXT: v_mul_lo_u32 v4, v4, v2
+; GISEL-NEXT: v_mul_lo_u32 v16, v2, v8
+; GISEL-NEXT: v_mul_hi_u32 v17, v12, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, v2, v8
; GISEL-NEXT: v_mul_lo_u32 v9, v9, v11
; GISEL-NEXT: v_mul_lo_u32 v18, v11, v13
; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13
@@ -2422,136 +2422,136 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v9
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v15
-; GISEL-NEXT: v_mul_lo_u32 v9, v6, v4
-; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4
-; GISEL-NEXT: v_mul_hi_u32 v14, v6, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4
+; GISEL-NEXT: v_mul_lo_u32 v9, v12, v4
+; GISEL-NEXT: v_mul_lo_u32 v10, v2, v4
+; GISEL-NEXT: v_mul_hi_u32 v14, v12, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v2, v4
; GISEL-NEXT: v_mul_lo_u32 v15, v7, v5
-; GISEL-NEXT: v_mul_lo_u32 v20, v11, v5
-; GISEL-NEXT: v_mul_hi_u32 v21, v7, v5
-; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15
; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v20, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v21
+; GISEL-NEXT: v_mul_lo_u32 v15, v11, v5
+; GISEL-NEXT: v_mul_hi_u32 v19, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
+; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v16, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v10, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v17
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19
+; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v15
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v17
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v17
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v14
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc
-; GISEL-NEXT: v_mul_lo_u32 v8, 0, v6
-; GISEL-NEXT: v_mul_hi_u32 v9, v3, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, 0, v6
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v14
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
+; GISEL-NEXT: v_mul_lo_u32 v4, 0, v8
+; GISEL-NEXT: v_mul_hi_u32 v9, v6, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, 0, v8
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13
; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc
; GISEL-NEXT: v_mul_lo_u32 v10, 0, v7
-; GISEL-NEXT: v_mul_hi_u32 v11, v2, v7
+; GISEL-NEXT: v_mul_hi_u32 v11, v0, v7
; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7
-; GISEL-NEXT: v_mul_lo_u32 v12, v3, v4
-; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4
-; GISEL-NEXT: v_mul_hi_u32 v14, v3, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4
-; GISEL-NEXT: v_mul_lo_u32 v15, v2, v5
+; GISEL-NEXT: v_mul_lo_u32 v12, v6, v2
+; GISEL-NEXT: v_mul_lo_u32 v13, 0, v2
+; GISEL-NEXT: v_mul_hi_u32 v14, v6, v2
+; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2
+; GISEL-NEXT: v_mul_lo_u32 v15, v0, v5
; GISEL-NEXT: v_mul_lo_u32 v16, 0, v5
-; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5
+; GISEL-NEXT: v_mul_hi_u32 v17, v0, v5
; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_mul_lo_u32 v9, v1, v6
-; GISEL-NEXT: v_mul_lo_u32 v12, 0, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6
+; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4
+; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_lo_u32 v11, v0, v7
+; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7
; GISEL-NEXT: v_mul_lo_u32 v13, 0, v7
-; GISEL-NEXT: v_mul_hi_u32 v7, v0, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT: v_mul_lo_u32 v4, v1, v4
-; GISEL-NEXT: v_mul_lo_u32 v5, v0, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4
+; GISEL-NEXT: v_mul_lo_u32 v2, v3, v2
+; GISEL-NEXT: v_mul_lo_u32 v5, v1, v5
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v9
-; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v4, vcc
-; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], 0, v4
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v7
+; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v6, v9
+; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v2, vcc
+; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], 0, v2
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v11
-; GISEL-NEXT: v_subb_u32_e64 v8, s[6:7], 0, v5, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v5, s[6:7], 0, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7]
+; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v11
+; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], 0, v4, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], 0, v4
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[6:7]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6
; GISEL-NEXT: v_cndmask_b32_e64 v7, -1, v7, s[6:7]
-; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
-; GISEL-NEXT: v_cndmask_b32_e32 v9, -1, v9, vcc
-; GISEL-NEXT: v_subbrev_u32_e64 v5, vcc, 0, v5, s[4:5]
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v3, v1
-; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v1
+; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
+; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GISEL-NEXT: v_subbrev_u32_e64 v0, vcc, 0, v0, s[4:5]
+; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v5, v3
+; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v3
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v0
-; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v8, v1
+; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v0, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GISEL-NEXT: v_cndmask_b32_e32 v11, -1, v11, vcc
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v4, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v13, -1, v13, vcc
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v0
-; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v5, vcc
+; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v10, v3
+; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v2, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13
+; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v12, v1
+; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v10, v12, v0, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v10, v2, v14, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v15, s[4:5]
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v13, v15, s[4:5]
+; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v10, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5]
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_urem_v2i64_24bit:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 0042d34e235d17..4faa7edadf07a5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -1346,29 +1346,29 @@ define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v16
; GFX6-NEXT: v_min_u32_e32 v16, v3, v19
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v16
-; GFX6-NEXT: v_min_u32_e32 v16, v4, v20
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v16
-; GFX6-NEXT: v_min_u32_e32 v16, v5, v21
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v16
-; GFX6-NEXT: v_min_u32_e32 v16, v6, v22
-; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v16
-; GFX6-NEXT: v_min_u32_e32 v16, v7, v23
-; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v16
-; GFX6-NEXT: v_min_u32_e32 v16, v8, v24
-; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v16
-; GFX6-NEXT: v_min_u32_e32 v16, v9, v25
-; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16
-; GFX6-NEXT: v_min_u32_e32 v16, v10, v26
-; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v16
; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32
-; GFX6-NEXT: v_min_u32_e32 v17, v11, v27
-; GFX6-NEXT: v_min_u32_e32 v18, v12, v28
-; GFX6-NEXT: v_min_u32_e32 v19, v13, v29
-; GFX6-NEXT: v_min_u32_e32 v20, v14, v30
-; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v17
-; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v18
-; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v19
-; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v20
+; GFX6-NEXT: v_min_u32_e32 v17, v4, v20
+; GFX6-NEXT: v_min_u32_e32 v18, v5, v21
+; GFX6-NEXT: v_min_u32_e32 v19, v6, v22
+; GFX6-NEXT: v_min_u32_e32 v20, v7, v23
+; GFX6-NEXT: v_min_u32_e32 v21, v8, v24
+; GFX6-NEXT: v_min_u32_e32 v22, v9, v25
+; GFX6-NEXT: v_min_u32_e32 v23, v10, v26
+; GFX6-NEXT: v_min_u32_e32 v24, v11, v27
+; GFX6-NEXT: v_min_u32_e32 v25, v12, v28
+; GFX6-NEXT: v_min_u32_e32 v26, v13, v29
+; GFX6-NEXT: v_min_u32_e32 v27, v14, v30
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v17
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v18
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v19
+; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v20
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v21
+; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v22
+; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v23
+; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v24
+; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v25
+; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v26
+; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v27
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_min_u32_e32 v16, v15, v16
; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v16
diff --git a/llvm/test/CodeGen/AMDGPU/abs_i16.ll b/llvm/test/CodeGen/AMDGPU/abs_i16.ll
index daed0986fa9c88..0ae2b4f549919d 100644
--- a/llvm/test/CodeGen/AMDGPU/abs_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/abs_i16.ll
@@ -823,32 +823,32 @@ define <16 x i16> @v_abs_v16i16(<16 x i16> %arg) {
; GFX8-NEXT: v_sub_u16_sdwa v14, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_u16_sdwa v15, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_u16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_u16_e32 v19, 0, v0
+; GFX8-NEXT: v_max_i16_sdwa v8, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_max_i16_e32 v0, v0, v19
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT: v_sub_u16_e32 v8, 0, v1
+; GFX8-NEXT: v_max_i16_sdwa v15, v1, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_max_i16_e32 v1, v1, v8
; GFX8-NEXT: v_sub_u16_e32 v16, 0, v7
; GFX8-NEXT: v_sub_u16_e32 v17, 0, v6
; GFX8-NEXT: v_sub_u16_e32 v18, 0, v5
; GFX8-NEXT: v_sub_u16_e32 v19, 0, v4
-; GFX8-NEXT: v_sub_u16_e32 v20, 0, v3
-; GFX8-NEXT: v_sub_u16_e32 v21, 0, v2
-; GFX8-NEXT: v_sub_u16_e32 v22, 0, v1
-; GFX8-NEXT: v_sub_u16_e32 v23, 0, v0
+; GFX8-NEXT: v_sub_u16_e32 v8, 0, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v15
+; GFX8-NEXT: v_sub_u16_e32 v15, 0, v2
; GFX8-NEXT: v_max_i16_sdwa v9, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_max_i16_sdwa v10, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_max_i16_sdwa v11, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_max_i16_sdwa v12, v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_max_i16_sdwa v13, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_max_i16_sdwa v14, v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_max_i16_sdwa v15, v1, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_max_i16_sdwa v8, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_max_i16_e32 v0, v0, v23
-; GFX8-NEXT: v_max_i16_e32 v1, v1, v22
-; GFX8-NEXT: v_max_i16_e32 v2, v2, v21
-; GFX8-NEXT: v_max_i16_e32 v3, v3, v20
+; GFX8-NEXT: v_max_i16_e32 v2, v2, v15
+; GFX8-NEXT: v_max_i16_e32 v3, v3, v8
; GFX8-NEXT: v_max_i16_e32 v4, v4, v19
; GFX8-NEXT: v_max_i16_e32 v5, v5, v18
; GFX8-NEXT: v_max_i16_e32 v6, v6, v17
; GFX8-NEXT: v_max_i16_e32 v7, v7, v16
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v8
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v15
; GFX8-NEXT: v_or_b32_e32 v2, v2, v14
; GFX8-NEXT: v_or_b32_e32 v3, v3, v13
; GFX8-NEXT: v_or_b32_e32 v4, v4, v12
@@ -1255,85 +1255,85 @@ define <32 x i16> @v_abs_v32i16(<32 x i16> %arg) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v16, 0
-; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_sub_u16_e32 v20, 0, v0
-; GFX8-NEXT: v_max_i16_sdwa v19, v0, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_max_i16_e32 v0, v0, v20
-; GFX8-NEXT: v_sub_u16_sdwa v20, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v19
-; GFX8-NEXT: v_sub_u16_e32 v19, 0, v1
-; GFX8-NEXT: v_max_i16_sdwa v20, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_max_i16_e32 v1, v1, v19
-; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v20
-; GFX8-NEXT: v_sub_u16_e32 v20, 0, v2
-; GFX8-NEXT: v_max_i16_sdwa v19, v2, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_max_i16_e32 v2, v2, v20
-; GFX8-NEXT: v_sub_u16_sdwa v20, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v19
-; GFX8-NEXT: v_sub_u16_e32 v19, 0, v3
-; GFX8-NEXT: v_max_i16_sdwa v20, v3, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_max_i16_e32 v3, v3, v19
-; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v20
-; GFX8-NEXT: v_sub_u16_e32 v20, 0, v4
-; GFX8-NEXT: v_max_i16_sdwa v19, v4, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_max_i16_e32 v4, v4, v20
-; GFX8-NEXT: v_sub_u16_sdwa v20, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v4, v4, v19
-; GFX8-NEXT: v_sub_u16_e32 v19, 0, v5
-; GFX8-NEXT: v_max_i16_sdwa v20, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_max_i16_e32 v5, v5, v19
-; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v5, v5, v20
-; GFX8-NEXT: v_sub_u16_e32 v20, 0, v6
-; GFX8-NEXT: v_max_i16_sdwa v19, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_max_i16_e32 v6, v6, v20
-; GFX8-NEXT: v_sub_u16_sdwa v20, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v6, v6, v19
-; GFX8-NEXT: v_sub_u16_e32 v19, 0, v7
-; GFX8-NEXT: v_max_i16_sdwa v20, v7, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_max_i16_e32 v7, v7, v19
-; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v7, v7, v20
-; GFX8-NEXT: v_sub_u16_e32 v20, 0, v8
-; GFX8-NEXT: v_max_i16_sdwa v19, v8, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_max_i16_e32 v8, v8, v20
-; GFX8-NEXT: v_sub_u16_sdwa v20, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v8, v8, v19
-; GFX8-NEXT: v_sub_u16_e32 v19, 0, v9
-; GFX8-NEXT: v_max_i16_sdwa v20, v9, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_max_i16_e32 v9, v9, v19
-; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v9, v9, v20
-; GFX8-NEXT: v_sub_u16_e32 v20, 0, v10
-; GFX8-NEXT: v_max_i16_sdwa v19, v10, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_max_i16_e32 v10, v10, v20
-; GFX8-NEXT: v_sub_u16_sdwa v20, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v10, v10, v19
-; GFX8-NEXT: v_sub_u16_e32 v19, 0, v11
-; GFX8-NEXT: v_max_i16_sdwa v20, v11, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_max_i16_e32 v11, v11, v19
+; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_u16_e32 v19, 0, v0
+; GFX8-NEXT: v_max_i16_sdwa v18, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_max_i16_e32 v0, v0, v19
+; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v18
+; GFX8-NEXT: v_sub_u16_e32 v18, 0, v1
+; GFX8-NEXT: v_max_i16_sdwa v19, v1, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_max_i16_e32 v1, v1, v18
+; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v19
+; GFX8-NEXT: v_sub_u16_e32 v19, 0, v2
+; GFX8-NEXT: v_max_i16_sdwa v18, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_max_i16_e32 v2, v2, v19
+; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v18
+; GFX8-NEXT: v_sub_u16_e32 v18, 0, v3
+; GFX8-NEXT: v_max_i16_sdwa v19, v3, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_max_i16_e32 v3, v3, v18
+; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v19
+; GFX8-NEXT: v_sub_u16_e32 v19, 0, v4
+; GFX8-NEXT: v_max_i16_sdwa v18, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_max_i16_e32 v4, v4, v19
+; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v18
+; GFX8-NEXT: v_sub_u16_e32 v18, 0, v5
+; GFX8-NEXT: v_max_i16_sdwa v19, v5, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_max_i16_e32 v5, v5, v18
+; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v19
+; GFX8-NEXT: v_sub_u16_e32 v19, 0, v6
+; GFX8-NEXT: v_max_i16_sdwa v18, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_max_i16_e32 v6, v6, v19
+; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v18
+; GFX8-NEXT: v_sub_u16_e32 v18, 0, v7
+; GFX8-NEXT: v_max_i16_sdwa v19, v7, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_max_i16_e32 v7, v7, v18
+; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v7, v7, v19
+; GFX8-NEXT: v_sub_u16_e32 v19, 0, v8
+; GFX8-NEXT: v_max_i16_sdwa v18, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_max_i16_e32 v8, v8, v19
+; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v8, v8, v18
+; GFX8-NEXT: v_sub_u16_e32 v18, 0, v9
+; GFX8-NEXT: v_max_i16_sdwa v19, v9, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_max_i16_e32 v9, v9, v18
+; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v9, v9, v19
+; GFX8-NEXT: v_sub_u16_e32 v19, 0, v10
+; GFX8-NEXT: v_max_i16_sdwa v18, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_max_i16_e32 v10, v10, v19
+; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v10, v10, v18
+; GFX8-NEXT: v_sub_u16_e32 v18, 0, v11
+; GFX8-NEXT: v_max_i16_sdwa v19, v11, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_max_i16_e32 v11, v11, v18
+; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v11, v11, v19
+; GFX8-NEXT: v_sub_u16_e32 v19, 0, v12
+; GFX8-NEXT: v_max_i16_sdwa v18, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_max_i16_e32 v12, v12, v19
; GFX8-NEXT: v_sub_u16_sdwa v17, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_sub_u16_sdwa v16, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v11, v11, v20
-; GFX8-NEXT: v_sub_u16_e32 v20, 0, v12
-; GFX8-NEXT: v_max_i16_sdwa v16, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_max_i16_e32 v12, v12, v20
-; GFX8-NEXT: v_or_b32_e32 v12, v12, v16
-; GFX8-NEXT: v_sub_u16_e32 v16, 0, v13
-; GFX8-NEXT: v_max_i16_sdwa v19, v13, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_sub_u16_e32 v20, 0, v15
-; GFX8-NEXT: v_max_i16_e32 v13, v13, v16
+; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_u16_sdwa v16, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v12, v12, v18
+; GFX8-NEXT: v_sub_u16_e32 v18, 0, v13
+; GFX8-NEXT: v_max_i16_sdwa v16, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_max_i16_e32 v13, v13, v18
+; GFX8-NEXT: v_sub_u16_e32 v18, 0, v15
+; GFX8-NEXT: v_or_b32_e32 v13, v13, v16
; GFX8-NEXT: v_sub_u16_e32 v16, 0, v14
; GFX8-NEXT: v_max_i16_sdwa v17, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_max_i16_sdwa v18, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_max_i16_sdwa v19, v14, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_max_i16_e32 v14, v14, v16
-; GFX8-NEXT: v_max_i16_e32 v15, v15, v20
-; GFX8-NEXT: v_or_b32_e32 v13, v13, v19
-; GFX8-NEXT: v_or_b32_e32 v14, v14, v18
+; GFX8-NEXT: v_max_i16_e32 v15, v15, v18
+; GFX8-NEXT: v_or_b32_e32 v14, v14, v19
; GFX8-NEXT: v_or_b32_e32 v15, v15, v17
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll
index 033af692438015..cd5b585a8c4e23 100644
--- a/llvm/test/CodeGen/AMDGPU/add.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.ll
@@ -474,44 +474,44 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_add_i32 s4, s11, s39
-; GFX6-NEXT: s_add_i32 s5, s10, s38
-; GFX6-NEXT: s_add_i32 s6, s9, s37
-; GFX6-NEXT: s_add_i32 s7, s8, s36
-; GFX6-NEXT: s_add_i32 s8, s15, s43
-; GFX6-NEXT: s_add_i32 s9, s14, s42
-; GFX6-NEXT: s_add_i32 s10, s13, s41
-; GFX6-NEXT: s_add_i32 s11, s12, s40
-; GFX6-NEXT: s_add_i32 s12, s19, s47
-; GFX6-NEXT: s_add_i32 s13, s18, s46
-; GFX6-NEXT: s_add_i32 s14, s17, s45
-; GFX6-NEXT: s_add_i32 s15, s16, s44
-; GFX6-NEXT: s_add_i32 s16, s23, s51
-; GFX6-NEXT: s_add_i32 s17, s22, s50
-; GFX6-NEXT: s_add_i32 s18, s21, s49
-; GFX6-NEXT: s_add_i32 s19, s20, s48
-; GFX6-NEXT: v_mov_b32_e32 v0, s19
-; GFX6-NEXT: v_mov_b32_e32 v1, s18
-; GFX6-NEXT: v_mov_b32_e32 v2, s17
-; GFX6-NEXT: v_mov_b32_e32 v3, s16
+; GFX6-NEXT: s_add_i32 s6, s11, s39
+; GFX6-NEXT: s_add_i32 s7, s10, s38
+; GFX6-NEXT: s_add_i32 s10, s15, s43
+; GFX6-NEXT: s_add_i32 s11, s14, s42
+; GFX6-NEXT: s_add_i32 s14, s19, s47
+; GFX6-NEXT: s_add_i32 s15, s18, s46
+; GFX6-NEXT: s_add_i32 s18, s23, s51
+; GFX6-NEXT: s_add_i32 s19, s22, s50
+; GFX6-NEXT: s_add_i32 s21, s21, s49
+; GFX6-NEXT: s_add_i32 s20, s20, s48
+; GFX6-NEXT: s_add_i32 s17, s17, s45
+; GFX6-NEXT: s_add_i32 s16, s16, s44
+; GFX6-NEXT: v_mov_b32_e32 v0, s20
+; GFX6-NEXT: v_mov_b32_e32 v1, s21
+; GFX6-NEXT: v_mov_b32_e32 v2, s19
+; GFX6-NEXT: v_mov_b32_e32 v3, s18
+; GFX6-NEXT: s_add_i32 s13, s13, s41
+; GFX6-NEXT: s_add_i32 s12, s12, s40
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s15
-; GFX6-NEXT: v_mov_b32_e32 v1, s14
-; GFX6-NEXT: v_mov_b32_e32 v2, s13
-; GFX6-NEXT: v_mov_b32_e32 v3, s12
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: v_mov_b32_e32 v1, s17
+; GFX6-NEXT: v_mov_b32_e32 v2, s15
+; GFX6-NEXT: v_mov_b32_e32 v3, s14
+; GFX6-NEXT: s_add_i32 s9, s9, s37
+; GFX6-NEXT: s_add_i32 s8, s8, s36
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s11
-; GFX6-NEXT: v_mov_b32_e32 v1, s10
-; GFX6-NEXT: v_mov_b32_e32 v2, s9
-; GFX6-NEXT: v_mov_b32_e32 v3, s8
+; GFX6-NEXT: v_mov_b32_e32 v0, s12
+; GFX6-NEXT: v_mov_b32_e32 v1, s13
+; GFX6-NEXT: v_mov_b32_e32 v2, s11
+; GFX6-NEXT: v_mov_b32_e32 v3, s10
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s7
-; GFX6-NEXT: v_mov_b32_e32 v1, s6
-; GFX6-NEXT: v_mov_b32_e32 v2, s5
-; GFX6-NEXT: v_mov_b32_e32 v3, s4
+; GFX6-NEXT: v_mov_b32_e32 v0, s8
+; GFX6-NEXT: v_mov_b32_e32 v1, s9
+; GFX6-NEXT: v_mov_b32_e32 v2, s7
+; GFX6-NEXT: v_mov_b32_e32 v3, s6
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index 236956c1829e77..f176f34f847366 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -485,13 +485,10 @@ define <16 x ptr addrspace(5)> @addrspacecast_v16p0_to_v16p5(<16 x ptr> %ptr) {
; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; HSA-NEXT: buffer_load_dword v31, off, s[0:3], s32
; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; HSA-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[24:25]
; HSA-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; HSA-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[26:27]
; HSA-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc
; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; HSA-NEXT: v_cmp_ne_u64_e64 s[8:9], 0, v[28:29]
; HSA-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc
; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
; HSA-NEXT: v_cndmask_b32_e32 v3, -1, v6, vcc
@@ -500,13 +497,10 @@ define <16 x ptr addrspace(5)> @addrspacecast_v16p0_to_v16p5(<16 x ptr> %ptr) {
; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
; HSA-NEXT: v_cndmask_b32_e32 v5, -1, v10, vcc
; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13]
-; HSA-NEXT: v_cndmask_b32_e64 v13, -1, v26, s[6:7]
; HSA-NEXT: v_cndmask_b32_e32 v6, -1, v12, vcc
; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
-; HSA-NEXT: v_cndmask_b32_e64 v12, -1, v24, s[4:5]
; HSA-NEXT: v_cndmask_b32_e32 v7, -1, v14, vcc
; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
-; HSA-NEXT: v_cndmask_b32_e64 v14, -1, v28, s[8:9]
; HSA-NEXT: v_cndmask_b32_e32 v8, -1, v16, vcc
; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
; HSA-NEXT: v_cndmask_b32_e32 v9, -1, v18, vcc
@@ -514,6 +508,12 @@ define <16 x ptr addrspace(5)> @addrspacecast_v16p0_to_v16p5(<16 x ptr> %ptr) {
; HSA-NEXT: v_cndmask_b32_e32 v10, -1, v20, vcc
; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[22:23]
; HSA-NEXT: v_cndmask_b32_e32 v11, -1, v22, vcc
+; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[24:25]
+; HSA-NEXT: v_cndmask_b32_e32 v12, -1, v24, vcc
+; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[26:27]
+; HSA-NEXT: v_cndmask_b32_e32 v13, -1, v26, vcc
+; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[28:29]
+; HSA-NEXT: v_cndmask_b32_e32 v14, -1, v28, vcc
; HSA-NEXT: s_waitcnt vmcnt(0)
; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[30:31]
; HSA-NEXT: v_cndmask_b32_e32 v15, -1, v30, vcc
@@ -733,65 +733,64 @@ define <16 x ptr> @addrspacecast_v16p5_to_v16p0(<16 x ptr addrspace(5)> %ptr) {
; CI-NEXT: s_load_dword s4, s[6:7], 0x11
; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
; CI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CI-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v6
-; CI-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v7
+; CI-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v5
+; CI-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v6
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v31, s4
-; CI-NEXT: v_cndmask_b32_e32 v48, 0, v31, vcc
+; CI-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc
; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v1
-; CI-NEXT: v_cndmask_b32_e32 v35, 0, v1, vcc
-; CI-NEXT: v_cndmask_b32_e32 v33, 0, v31, vcc
+; CI-NEXT: v_cndmask_b32_e32 v34, 0, v1, vcc
+; CI-NEXT: v_cndmask_b32_e32 v39, 0, v31, vcc
; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v2
-; CI-NEXT: v_cndmask_b32_e32 v36, 0, v2, vcc
-; CI-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc
+; CI-NEXT: v_cndmask_b32_e32 v35, 0, v2, vcc
+; CI-NEXT: v_cndmask_b32_e32 v32, 0, v31, vcc
; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v3
-; CI-NEXT: v_cndmask_b32_e32 v37, 0, v3, vcc
-; CI-NEXT: v_cndmask_b32_e32 v34, 0, v31, vcc
-; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v4
-; CI-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v5
-; CI-NEXT: v_cndmask_b32_e32 v38, 0, v4, vcc
-; CI-NEXT: v_cndmask_b32_e64 v50, 0, v5, s[4:5]
-; CI-NEXT: v_cndmask_b32_e64 v39, 0, v6, s[6:7]
-; CI-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[8:9]
-; CI-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v8
-; CI-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v9
-; CI-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v10
-; CI-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v11
-; CI-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v12
-; CI-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v13
-; CI-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v14
-; CI-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v15
-; CI-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[10:11]
-; CI-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[12:13]
-; CI-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[14:15]
-; CI-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[16:17]
-; CI-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[18:19]
-; CI-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[20:21]
-; CI-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[22:23]
-; CI-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[24:25]
-; CI-NEXT: v_cndmask_b32_e32 v9, 0, v31, vcc
-; CI-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[4:5]
-; CI-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[6:7]
-; CI-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[8:9]
-; CI-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[10:11]
-; CI-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[12:13]
-; CI-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[14:15]
-; CI-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[16:17]
-; CI-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[18:19]
-; CI-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[20:21]
-; CI-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[22:23]
-; CI-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[24:25]
-; CI-NEXT: v_mov_b32_e32 v1, v48
-; CI-NEXT: v_mov_b32_e32 v2, v35
-; CI-NEXT: v_mov_b32_e32 v3, v33
-; CI-NEXT: v_mov_b32_e32 v4, v36
-; CI-NEXT: v_mov_b32_e32 v5, v49
-; CI-NEXT: v_mov_b32_e32 v6, v37
-; CI-NEXT: v_mov_b32_e32 v7, v34
-; CI-NEXT: v_mov_b32_e32 v8, v38
-; CI-NEXT: v_mov_b32_e32 v10, v50
-; CI-NEXT: v_mov_b32_e32 v12, v39
-; CI-NEXT: v_mov_b32_e32 v14, v32
+; CI-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v4
+; CI-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v7
+; CI-NEXT: v_cndmask_b32_e32 v36, 0, v3, vcc
+; CI-NEXT: v_cndmask_b32_e64 v48, 0, v4, s[4:5]
+; CI-NEXT: v_cndmask_b32_e64 v37, 0, v5, s[6:7]
+; CI-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[8:9]
+; CI-NEXT: v_cndmask_b32_e64 v38, 0, v7, s[10:11]
+; CI-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v8
+; CI-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v9
+; CI-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v10
+; CI-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v11
+; CI-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v12
+; CI-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v13
+; CI-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v14
+; CI-NEXT: v_cmp_ne_u32_e64 s[26:27], -1, v15
+; CI-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[12:13]
+; CI-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[14:15]
+; CI-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[16:17]
+; CI-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[18:19]
+; CI-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[20:21]
+; CI-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[22:23]
+; CI-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[24:25]
+; CI-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[26:27]
+; CI-NEXT: v_cndmask_b32_e32 v7, 0, v31, vcc
+; CI-NEXT: v_cndmask_b32_e64 v9, 0, v31, s[4:5]
+; CI-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[6:7]
+; CI-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[8:9]
+; CI-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[10:11]
+; CI-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[12:13]
+; CI-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[14:15]
+; CI-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[16:17]
+; CI-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[18:19]
+; CI-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[20:21]
+; CI-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[22:23]
+; CI-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[24:25]
+; CI-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[26:27]
+; CI-NEXT: v_mov_b32_e32 v1, v49
+; CI-NEXT: v_mov_b32_e32 v2, v34
+; CI-NEXT: v_mov_b32_e32 v3, v39
+; CI-NEXT: v_mov_b32_e32 v4, v35
+; CI-NEXT: v_mov_b32_e32 v5, v32
+; CI-NEXT: v_mov_b32_e32 v6, v36
+; CI-NEXT: v_mov_b32_e32 v8, v48
+; CI-NEXT: v_mov_b32_e32 v10, v37
+; CI-NEXT: v_mov_b32_e32 v12, v33
+; CI-NEXT: v_mov_b32_e32 v14, v38
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: addrspacecast_v16p5_to_v16p0:
@@ -801,63 +800,62 @@ define <16 x ptr> @addrspacecast_v16p5_to_v16p0(<16 x ptr addrspace(5)> %ptr) {
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
; GFX9-NEXT: v_mov_b32_e32 v31, s5
; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v48, 0, v31, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v35, 0, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v33, 0, v31, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v34, 0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v39, 0, v31, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v36, 0, v2, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v35, 0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v32, 0, v31, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v37, 0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v34, 0, v31, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v4
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v5
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v6
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v38, 0, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v50, 0, v5, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v39, 0, v6, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[8:9]
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v8
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v9
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v10
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v11
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v12
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v13
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v14
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v15
-; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[10:11]
-; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[12:13]
-; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[14:15]
-; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[16:17]
-; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[18:19]
-; GFX9-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[20:21]
-; GFX9-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[22:23]
-; GFX9-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[24:25]
-; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v31, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[8:9]
-; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[10:11]
-; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[12:13]
-; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[14:15]
-; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[16:17]
-; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[18:19]
-; GFX9-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[20:21]
-; GFX9-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[22:23]
-; GFX9-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[24:25]
-; GFX9-NEXT: v_mov_b32_e32 v1, v48
-; GFX9-NEXT: v_mov_b32_e32 v2, v35
-; GFX9-NEXT: v_mov_b32_e32 v3, v33
-; GFX9-NEXT: v_mov_b32_e32 v4, v36
-; GFX9-NEXT: v_mov_b32_e32 v5, v49
-; GFX9-NEXT: v_mov_b32_e32 v6, v37
-; GFX9-NEXT: v_mov_b32_e32 v7, v34
-; GFX9-NEXT: v_mov_b32_e32 v8, v38
-; GFX9-NEXT: v_mov_b32_e32 v10, v50
-; GFX9-NEXT: v_mov_b32_e32 v12, v39
-; GFX9-NEXT: v_mov_b32_e32 v14, v32
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v4
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v5
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v6
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v36, 0, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v48, 0, v4, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v37, 0, v5, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v38, 0, v7, s[10:11]
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v8
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v9
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v10
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v11
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v12
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v13
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v14
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[26:27], -1, v15
+; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[12:13]
+; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[14:15]
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[16:17]
+; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[18:19]
+; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[20:21]
+; GFX9-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[22:23]
+; GFX9-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[24:25]
+; GFX9-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[26:27]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v31, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, v31, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[10:11]
+; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[12:13]
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[14:15]
+; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[16:17]
+; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[18:19]
+; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[20:21]
+; GFX9-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[22:23]
+; GFX9-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[24:25]
+; GFX9-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[26:27]
+; GFX9-NEXT: v_mov_b32_e32 v1, v49
+; GFX9-NEXT: v_mov_b32_e32 v2, v34
+; GFX9-NEXT: v_mov_b32_e32 v3, v39
+; GFX9-NEXT: v_mov_b32_e32 v4, v35
+; GFX9-NEXT: v_mov_b32_e32 v5, v32
+; GFX9-NEXT: v_mov_b32_e32 v6, v36
+; GFX9-NEXT: v_mov_b32_e32 v8, v48
+; GFX9-NEXT: v_mov_b32_e32 v10, v37
+; GFX9-NEXT: v_mov_b32_e32 v12, v33
+; GFX9-NEXT: v_mov_b32_e32 v14, v38
; GFX9-NEXT: s_setpc_b64 s[30:31]
%cast = addrspacecast <16 x ptr addrspace(5)> %ptr to <16 x ptr>
ret <16 x ptr> %cast
@@ -939,13 +937,10 @@ define <16 x ptr addrspace(3)> @addrspacecast_v16p0_to_v16p3(<16 x ptr> %ptr) {
; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; HSA-NEXT: buffer_load_dword v31, off, s[0:3], s32
; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; HSA-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[24:25]
; HSA-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; HSA-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[26:27]
; HSA-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc
; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; HSA-NEXT: v_cmp_ne_u64_e64 s[8:9], 0, v[28:29]
; HSA-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc
; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
; HSA-NEXT: v_cndmask_b32_e32 v3, -1, v6, vcc
@@ -954,13 +949,10 @@ define <16 x ptr addrspace(3)> @addrspacecast_v16p0_to_v16p3(<16 x ptr> %ptr) {
; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
; HSA-NEXT: v_cndmask_b32_e32 v5, -1, v10, vcc
; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13]
-; HSA-NEXT: v_cndmask_b32_e64 v13, -1, v26, s[6:7]
; HSA-NEXT: v_cndmask_b32_e32 v6, -1, v12, vcc
; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
-; HSA-NEXT: v_cndmask_b32_e64 v12, -1, v24, s[4:5]
; HSA-NEXT: v_cndmask_b32_e32 v7, -1, v14, vcc
; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
-; HSA-NEXT: v_cndmask_b32_e64 v14, -1, v28, s[8:9]
; HSA-NEXT: v_cndmask_b32_e32 v8, -1, v16, vcc
; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
; HSA-NEXT: v_cndmask_b32_e32 v9, -1, v18, vcc
@@ -968,6 +960,12 @@ define <16 x ptr addrspace(3)> @addrspacecast_v16p0_to_v16p3(<16 x ptr> %ptr) {
; HSA-NEXT: v_cndmask_b32_e32 v10, -1, v20, vcc
; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[22:23]
; HSA-NEXT: v_cndmask_b32_e32 v11, -1, v22, vcc
+; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[24:25]
+; HSA-NEXT: v_cndmask_b32_e32 v12, -1, v24, vcc
+; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[26:27]
+; HSA-NEXT: v_cndmask_b32_e32 v13, -1, v26, vcc
+; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[28:29]
+; HSA-NEXT: v_cndmask_b32_e32 v14, -1, v28, vcc
; HSA-NEXT: s_waitcnt vmcnt(0)
; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[30:31]
; HSA-NEXT: v_cndmask_b32_e32 v15, -1, v30, vcc
@@ -1187,65 +1185,64 @@ define <16 x ptr> @addrspacecast_v16p3_to_v16p0(<16 x ptr addrspace(3)> %ptr) {
; CI-NEXT: s_load_dword s4, s[6:7], 0x10
; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
; CI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CI-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v6
-; CI-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v7
+; CI-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v5
+; CI-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v6
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v31, s4
-; CI-NEXT: v_cndmask_b32_e32 v48, 0, v31, vcc
+; CI-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc
; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v1
-; CI-NEXT: v_cndmask_b32_e32 v35, 0, v1, vcc
-; CI-NEXT: v_cndmask_b32_e32 v33, 0, v31, vcc
+; CI-NEXT: v_cndmask_b32_e32 v34, 0, v1, vcc
+; CI-NEXT: v_cndmask_b32_e32 v39, 0, v31, vcc
; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v2
-; CI-NEXT: v_cndmask_b32_e32 v36, 0, v2, vcc
-; CI-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc
+; CI-NEXT: v_cndmask_b32_e32 v35, 0, v2, vcc
+; CI-NEXT: v_cndmask_b32_e32 v32, 0, v31, vcc
; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v3
-; CI-NEXT: v_cndmask_b32_e32 v37, 0, v3, vcc
-; CI-NEXT: v_cndmask_b32_e32 v34, 0, v31, vcc
-; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v4
-; CI-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v5
-; CI-NEXT: v_cndmask_b32_e32 v38, 0, v4, vcc
-; CI-NEXT: v_cndmask_b32_e64 v50, 0, v5, s[4:5]
-; CI-NEXT: v_cndmask_b32_e64 v39, 0, v6, s[6:7]
-; CI-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[8:9]
-; CI-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v8
-; CI-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v9
-; CI-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v10
-; CI-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v11
-; CI-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v12
-; CI-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v13
-; CI-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v14
-; CI-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v15
-; CI-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[10:11]
-; CI-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[12:13]
-; CI-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[14:15]
-; CI-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[16:17]
-; CI-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[18:19]
-; CI-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[20:21]
-; CI-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[22:23]
-; CI-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[24:25]
-; CI-NEXT: v_cndmask_b32_e32 v9, 0, v31, vcc
-; CI-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[4:5]
-; CI-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[6:7]
-; CI-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[8:9]
-; CI-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[10:11]
-; CI-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[12:13]
-; CI-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[14:15]
-; CI-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[16:17]
-; CI-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[18:19]
-; CI-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[20:21]
-; CI-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[22:23]
-; CI-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[24:25]
-; CI-NEXT: v_mov_b32_e32 v1, v48
-; CI-NEXT: v_mov_b32_e32 v2, v35
-; CI-NEXT: v_mov_b32_e32 v3, v33
-; CI-NEXT: v_mov_b32_e32 v4, v36
-; CI-NEXT: v_mov_b32_e32 v5, v49
-; CI-NEXT: v_mov_b32_e32 v6, v37
-; CI-NEXT: v_mov_b32_e32 v7, v34
-; CI-NEXT: v_mov_b32_e32 v8, v38
-; CI-NEXT: v_mov_b32_e32 v10, v50
-; CI-NEXT: v_mov_b32_e32 v12, v39
-; CI-NEXT: v_mov_b32_e32 v14, v32
+; CI-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v4
+; CI-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v7
+; CI-NEXT: v_cndmask_b32_e32 v36, 0, v3, vcc
+; CI-NEXT: v_cndmask_b32_e64 v48, 0, v4, s[4:5]
+; CI-NEXT: v_cndmask_b32_e64 v37, 0, v5, s[6:7]
+; CI-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[8:9]
+; CI-NEXT: v_cndmask_b32_e64 v38, 0, v7, s[10:11]
+; CI-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v8
+; CI-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v9
+; CI-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v10
+; CI-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v11
+; CI-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v12
+; CI-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v13
+; CI-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v14
+; CI-NEXT: v_cmp_ne_u32_e64 s[26:27], -1, v15
+; CI-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[12:13]
+; CI-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[14:15]
+; CI-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[16:17]
+; CI-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[18:19]
+; CI-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[20:21]
+; CI-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[22:23]
+; CI-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[24:25]
+; CI-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[26:27]
+; CI-NEXT: v_cndmask_b32_e32 v7, 0, v31, vcc
+; CI-NEXT: v_cndmask_b32_e64 v9, 0, v31, s[4:5]
+; CI-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[6:7]
+; CI-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[8:9]
+; CI-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[10:11]
+; CI-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[12:13]
+; CI-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[14:15]
+; CI-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[16:17]
+; CI-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[18:19]
+; CI-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[20:21]
+; CI-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[22:23]
+; CI-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[24:25]
+; CI-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[26:27]
+; CI-NEXT: v_mov_b32_e32 v1, v49
+; CI-NEXT: v_mov_b32_e32 v2, v34
+; CI-NEXT: v_mov_b32_e32 v3, v39
+; CI-NEXT: v_mov_b32_e32 v4, v35
+; CI-NEXT: v_mov_b32_e32 v5, v32
+; CI-NEXT: v_mov_b32_e32 v6, v36
+; CI-NEXT: v_mov_b32_e32 v8, v48
+; CI-NEXT: v_mov_b32_e32 v10, v37
+; CI-NEXT: v_mov_b32_e32 v12, v33
+; CI-NEXT: v_mov_b32_e32 v14, v38
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: addrspacecast_v16p3_to_v16p0:
@@ -1255,63 +1252,62 @@ define <16 x ptr> @addrspacecast_v16p3_to_v16p0(<16 x ptr addrspace(3)> %ptr) {
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
; GFX9-NEXT: v_mov_b32_e32 v31, s5
; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v48, 0, v31, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v35, 0, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v33, 0, v31, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v34, 0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v39, 0, v31, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v36, 0, v2, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v35, 0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v32, 0, v31, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v37, 0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v34, 0, v31, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v4
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v5
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v6
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v38, 0, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v50, 0, v5, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v39, 0, v6, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[8:9]
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v8
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v9
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v10
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v11
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v12
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v13
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v14
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v15
-; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[10:11]
-; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[12:13]
-; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[14:15]
-; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[16:17]
-; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[18:19]
-; GFX9-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[20:21]
-; GFX9-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[22:23]
-; GFX9-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[24:25]
-; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v31, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[8:9]
-; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[10:11]
-; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[12:13]
-; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[14:15]
-; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[16:17]
-; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[18:19]
-; GFX9-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[20:21]
-; GFX9-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[22:23]
-; GFX9-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[24:25]
-; GFX9-NEXT: v_mov_b32_e32 v1, v48
-; GFX9-NEXT: v_mov_b32_e32 v2, v35
-; GFX9-NEXT: v_mov_b32_e32 v3, v33
-; GFX9-NEXT: v_mov_b32_e32 v4, v36
-; GFX9-NEXT: v_mov_b32_e32 v5, v49
-; GFX9-NEXT: v_mov_b32_e32 v6, v37
-; GFX9-NEXT: v_mov_b32_e32 v7, v34
-; GFX9-NEXT: v_mov_b32_e32 v8, v38
-; GFX9-NEXT: v_mov_b32_e32 v10, v50
-; GFX9-NEXT: v_mov_b32_e32 v12, v39
-; GFX9-NEXT: v_mov_b32_e32 v14, v32
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v4
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v5
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v6
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v36, 0, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v48, 0, v4, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v37, 0, v5, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v38, 0, v7, s[10:11]
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v8
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v9
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v10
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v11
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v12
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v13
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v14
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[26:27], -1, v15
+; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[12:13]
+; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[14:15]
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[16:17]
+; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[18:19]
+; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[20:21]
+; GFX9-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[22:23]
+; GFX9-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[24:25]
+; GFX9-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[26:27]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v31, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, v31, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[10:11]
+; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[12:13]
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[14:15]
+; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[16:17]
+; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[18:19]
+; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[20:21]
+; GFX9-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[22:23]
+; GFX9-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[24:25]
+; GFX9-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[26:27]
+; GFX9-NEXT: v_mov_b32_e32 v1, v49
+; GFX9-NEXT: v_mov_b32_e32 v2, v34
+; GFX9-NEXT: v_mov_b32_e32 v3, v39
+; GFX9-NEXT: v_mov_b32_e32 v4, v35
+; GFX9-NEXT: v_mov_b32_e32 v5, v32
+; GFX9-NEXT: v_mov_b32_e32 v6, v36
+; GFX9-NEXT: v_mov_b32_e32 v8, v48
+; GFX9-NEXT: v_mov_b32_e32 v10, v37
+; GFX9-NEXT: v_mov_b32_e32 v12, v33
+; GFX9-NEXT: v_mov_b32_e32 v14, v38
; GFX9-NEXT: s_setpc_b64 s[30:31]
%cast = addrspacecast <16 x ptr addrspace(3)> %ptr to <16 x ptr>
ret <16 x ptr> %cast
@@ -1550,13 +1546,9 @@ define <16 x ptr> @addrspacecast_v16p6_to_v16p0(<16 x ptr addrspace(6)> %ptr) {
; HSA-LABEL: addrspacecast_v16p6_to_v16p0:
; HSA: ; %bb.0:
; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HSA-NEXT: v_mov_b32_e32 v30, v15
; HSA-NEXT: v_mov_b32_e32 v28, v14
-; HSA-NEXT: v_mov_b32_e32 v26, v13
; HSA-NEXT: v_mov_b32_e32 v24, v12
-; HSA-NEXT: v_mov_b32_e32 v22, v11
; HSA-NEXT: v_mov_b32_e32 v20, v10
-; HSA-NEXT: v_mov_b32_e32 v18, v9
; HSA-NEXT: v_mov_b32_e32 v16, v8
; HSA-NEXT: v_mov_b32_e32 v14, v7
; HSA-NEXT: v_mov_b32_e32 v12, v6
@@ -1569,6 +1561,10 @@ define <16 x ptr> @addrspacecast_v16p6_to_v16p0(<16 x ptr addrspace(6)> %ptr) {
; HSA-NEXT: v_mov_b32_e32 v3, 0
; HSA-NEXT: v_mov_b32_e32 v5, 0
; HSA-NEXT: v_mov_b32_e32 v7, 0
+; HSA-NEXT: v_mov_b32_e32 v18, v9
+; HSA-NEXT: v_mov_b32_e32 v22, v11
+; HSA-NEXT: v_mov_b32_e32 v26, v13
+; HSA-NEXT: v_mov_b32_e32 v30, v15
; HSA-NEXT: v_mov_b32_e32 v9, 0
; HSA-NEXT: v_mov_b32_e32 v11, 0
; HSA-NEXT: v_mov_b32_e32 v13, 0
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 823db84a053b8c..58bb4ef5789ec2 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -104,13 +104,12 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; copy
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v39, a1
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a16, v39
; GFX908-NEXT: buffer_load_dword v39, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX908-NEXT: v_accvgpr_read_b32 v32, a1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a0, v39 ; Reload Reuse
; GFX908-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v32
; GFX908-NEXT: v_accvgpr_write_b32 a11, v38 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_write_b32 a12, v37 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_write_b32 a13, v36 ; Reload Reuse
@@ -366,9 +365,6 @@ define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #0 {
ret void
}
-; FIXME: This case is broken. The asm value passed in v32 is live
-; through the range where the reserved def for the copy is introduced,
-; clobbering the user value.
define void @v32_asm_def_use(float %v0, float %v1) #0 {
; GFX908-LABEL: v32_asm_def_use:
; GFX908: ; %bb.0:
@@ -378,57 +374,48 @@ define void @v32_asm_def_use(float %v0, float %v1) #0 {
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def v[0:31] a[0:15]
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v35, a15
-; GFX908-NEXT: ;;#ASMSTART
-; GFX908-NEXT: ; def v32
-; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a31, v35
+; GFX908-NEXT: v_accvgpr_read_b32 v32, a15
; GFX908-NEXT: v_accvgpr_read_b32 v35, a14
-; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v32
+; GFX908-NEXT: v_accvgpr_read_b32 v32, a13
; GFX908-NEXT: v_accvgpr_write_b32 a30, v35
-; GFX908-NEXT: v_accvgpr_read_b32 v35, a13
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a29, v35
-; GFX908-NEXT: v_accvgpr_read_b32 v35, a12
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a28, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a11
-; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v32
+; GFX908-NEXT: v_accvgpr_read_b32 v32, a12
; GFX908-NEXT: v_accvgpr_write_b32 a27, v35
-; GFX908-NEXT: v_accvgpr_read_b32 v35, a10
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a26, v35
-; GFX908-NEXT: v_accvgpr_read_b32 v35, a9
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a25, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a8
-; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v32
+; GFX908-NEXT: v_accvgpr_read_b32 v32, a10
; GFX908-NEXT: v_accvgpr_write_b32 a24, v35
-; GFX908-NEXT: v_accvgpr_read_b32 v35, a7
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a23, v35
-; GFX908-NEXT: v_accvgpr_read_b32 v35, a6
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a22, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a5
-; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v32
+; GFX908-NEXT: v_accvgpr_read_b32 v32, a9
; GFX908-NEXT: v_accvgpr_write_b32 a21, v35
-; GFX908-NEXT: v_accvgpr_read_b32 v35, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v35, a2
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v32
+; GFX908-NEXT: v_accvgpr_read_b32 v32, a7
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v35
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v32
+; GFX908-NEXT: v_accvgpr_read_b32 v32, a6
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a20, v35
-; GFX908-NEXT: v_accvgpr_read_b32 v35, a3
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v32
+; GFX908-NEXT: v_accvgpr_read_b32 v32, a4
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a19, v35
-; GFX908-NEXT: v_accvgpr_read_b32 v35, a2
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v32
+; GFX908-NEXT: v_accvgpr_read_b32 v32, a3
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a18, v35
-; GFX908-NEXT: v_accvgpr_read_b32 v35, a1
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v32
+; GFX908-NEXT: v_accvgpr_read_b32 v32, a1
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a17, v35
-; GFX908-NEXT: v_accvgpr_read_b32 v35, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v32
+; GFX908-NEXT: v_accvgpr_read_b32 v32, a0
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a16, v35
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v32
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; def v32
+; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; copy
; GFX908-NEXT: ;;#ASMEND
@@ -1002,13 +989,12 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; copy
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v39, a1
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a32, v39
; GFX908-NEXT: buffer_load_dword v39, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX908-NEXT: v_accvgpr_read_b32 v33, a1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a0, v39 ; Reload Reuse
; GFX908-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX908-NEXT: v_accvgpr_write_b32 a32, v33
; GFX908-NEXT: v_accvgpr_write_b32 a11, v38 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_write_b32 a12, v37 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_write_b32 a13, v36 ; Reload Reuse
diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
index a6d8c6f41eee59..3e19ee5567929c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
@@ -2,8 +2,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-trap-handler < %s | FileCheck %s --check-prefixes=GCN,TRAP-HANDLER-DISABLE
; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs
-; TRAP-HANDLER-ENABLE: NumSgprs: 77
-; TRAP-HANDLER-DISABLE: NumSgprs: 92
+; TRAP-HANDLER-ENABLE: NumSgprs: 61
+; TRAP-HANDLER-DISABLE: NumSgprs: 77
define amdgpu_kernel void @amdhsa_trap_num_sgprs(
ptr addrspace(1) %out0, i32 %in0,
ptr addrspace(1) %out1, i32 %in1,
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index bc359d6ff3aaa0..4ccf92e68c8354 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -662,14 +662,12 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0
; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x6c, v0
; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x68, v0
-; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x64, v0
-; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x60, v0
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x64, v0
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x60, v0
; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0
-; GCN-NEXT: v_add_i32_e32 v16, vcc, 0x58, v0
-; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x54, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
@@ -677,9 +675,9 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GCN-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
-; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0
-; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0
-; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v12, s[0:3], 0 offen
@@ -687,60 +685,63 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GCN-NEXT: buffer_store_dword v3, v14, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
-; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0
-; GCN-NEXT: v_add_i32_e32 v11, vcc, 64, v0
-; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x4c, v0
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x48, v0
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x44, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v6, v15, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v5, v16, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v4, v17, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 64, v0
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0
; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0
-; GCN-NEXT: v_add_i32_e32 v21, vcc, 52, v0
-; GCN-NEXT: v_add_i32_e32 v22, vcc, 48, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v4, v10, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v3, v11, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v5, v11, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v4, v12, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 52, v0
; GCN-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v1, vcc, 44, v0
-; GCN-NEXT: buffer_store_dword v5, v20, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 40, v0
-; GCN-NEXT: buffer_store_dword v4, v21, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 48, v0
+; GCN-NEXT: buffer_store_dword v9, v20, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 44, v0
+; GCN-NEXT: buffer_store_dword v8, v21, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v4, vcc, 36, v0
-; GCN-NEXT: buffer_store_dword v3, v22, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 40, v0
+; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 36, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v3, vcc, 32, v0
-; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0
-; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 28, v0
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 24, v0
; GCN-NEXT: v_add_i32_e32 v19, vcc, 20, v0
-; GCN-NEXT: v_add_i32_e32 v20, vcc, 16, v0
-; GCN-NEXT: s_waitcnt vmcnt(6)
-; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v1, vcc, 12, v0
-; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0
-; GCN-NEXT: buffer_store_dword v8, v4, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v4, vcc, 4, v0
-; GCN-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 16, v0
+; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 12, v0
+; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0
+; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v0
; GCN-NEXT: s_waitcnt vmcnt(8)
-; GCN-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v17, v6, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v18, v9, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v17, v10, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v12, v4, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -758,14 +759,6 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x74, v0
; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0
; GFX7-NEXT: v_add_i32_e32 v19, vcc, 52, v0
-; GFX7-NEXT: v_add_i32_e32 v20, vcc, 48, v0
-; GFX7-NEXT: v_add_i32_e32 v21, vcc, 44, v0
-; GFX7-NEXT: v_add_i32_e32 v22, vcc, 40, v0
-; GFX7-NEXT: v_add_i32_e32 v23, vcc, 36, v0
-; GFX7-NEXT: v_add_i32_e32 v24, vcc, 32, v0
-; GFX7-NEXT: v_add_i32_e32 v25, vcc, 28, v0
-; GFX7-NEXT: v_add_i32_e32 v26, vcc, 24, v0
-; GFX7-NEXT: v_add_i32_e32 v27, vcc, 20, v0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
@@ -809,26 +802,34 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, 16, v0
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 48, v0
; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 12, v0
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; GFX7-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 8, v0
-; GFX7-NEXT: buffer_store_dword v3, v20, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 40, v0
+; GFX7-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 36, v0
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 28, v0
+; GFX7-NEXT: v_add_i32_e32 v6, vcc, 24, v0
+; GFX7-NEXT: v_add_i32_e32 v19, vcc, 20, v0
; GFX7-NEXT: s_waitcnt vmcnt(6)
-; GFX7-NEXT: buffer_store_dword v10, v21, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v9, v22, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v8, v23, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v7, v24, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0
+; GFX7-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 12, v0
+; GFX7-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 8, v0
+; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0
; GFX7-NEXT: s_waitcnt vmcnt(9)
-; GFX7-NEXT: buffer_store_dword v14, v25, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v13, v26, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v12, v27, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v13, v6, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(12)
-; GFX7-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v17, v4, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -1335,83 +1336,83 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-LABEL: v_store_global_v32bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v21
+; GCN-NEXT: v_alignbit_b32 v21, v23, v22, 16
+; GCN-NEXT: v_alignbit_b32 v20, v31, v20, 16
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v5
; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16
-; GCN-NEXT: v_alignbit_b32 v4, v31, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v16, v4, 16
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_alignbit_b32 v13, v0, v1, 16
-; GCN-NEXT: v_alignbit_b32 v12, v6, v7, 16
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v11, v0, v1, 16
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v10, v0, v1, 16
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v23
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v21
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v20
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_alignbit_b32 v9, v0, v1, 16
-; GCN-NEXT: v_alignbit_b32 v8, v6, v7, 16
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v7, v0, v1, 16
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v16
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v29
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NEXT: v_alignbit_b32 v6, v0, v1, 16
-; GCN-NEXT: v_alignbit_b32 v16, v16, v14, 16
-; GCN-NEXT: v_alignbit_b32 v15, v15, v17, 16
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v2, 16
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v24
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v14, v0, v14, 16
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v30
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v30
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_alignbit_b32 v2, v2, v7, 16
+; GCN-NEXT: v_alignbit_b32 v9, v6, v14, 16
+; GCN-NEXT: v_alignbit_b32 v8, v13, v12, 16
+; GCN-NEXT: v_alignbit_b32 v7, v11, v10, 16
+; GCN-NEXT: v_alignbit_b32 v6, v15, v16, 16
+; GCN-NEXT: v_alignbit_b32 v12, v28, v17, 16
+; GCN-NEXT: v_alignbit_b32 v11, v22, v23, 16
+; GCN-NEXT: v_alignbit_b32 v10, v25, v24, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32
-; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: s_waitcnt expcnt(1)
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v17
-; GCN-NEXT: v_alignbit_b32 v17, v6, v18, 16
-; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v26
+; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13
+; GCN-NEXT: v_alignbit_b32 v13, v6, v27, 16
+; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -1421,78 +1422,78 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16
; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GFX7-NEXT: v_alignbit_b32 v25, v25, v24, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v5
; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13
; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10
-; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16
; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12
-; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v9
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_alignbit_b32 v11, v7, v10, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v30
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v27
+; GFX7-NEXT: v_alignbit_b32 v27, v29, v28, 16
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_alignbit_b32 v26, v31, v26, 16
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: v_alignbit_b32 v4, v24, v4, 16
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v28, v7, v6, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v9
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8
; GFX7-NEXT: v_alignbit_b32 v10, v6, v7, 16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v23
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v22
; GFX7-NEXT: v_alignbit_b32 v9, v6, v7, 16
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v21
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v20
-; GFX7-NEXT: v_alignbit_b32 v8, v6, v7, 16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v21
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v18
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v20
; GFX7-NEXT: v_alignbit_b32 v7, v6, v7, 16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17
+; GFX7-NEXT: v_alignbit_b32 v8, v8, v14, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v16
-; GFX7-NEXT: v_alignbit_b32 v6, v6, v15, 16
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v30
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v24
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: v_alignbit_b32 v4, v31, v4, 16
-; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT: v_alignbit_b32 v17, v14, v15, 16
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v29
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v28
-; GFX7-NEXT: v_alignbit_b32 v16, v14, v15, 16
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v27
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v26
-; GFX7-NEXT: v_alignbit_b32 v15, v14, v15, 16
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v25
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT: v_alignbit_b32 v14, v14, v18, 16
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v16
+; GFX7-NEXT: v_alignbit_b32 v6, v6, v14, 16
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT: buffer_store_dwordx4 v[25:28], v[0:1], s[4:7], 0 addr64 offset:48
; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32
; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
@@ -1564,207 +1565,203 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v16
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:132
-; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GCN-NEXT: v_alignbit_b32 v18, v18, v22, 16
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[16:17], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v13
; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16
-; GCN-NEXT: v_alignbit_b32 v12, v18, v12, 16
+; GCN-NEXT: v_alignbit_b32 v12, v16, v12, 16
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16
; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16
-; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[16:17], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:128
-; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124
+; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16
+; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
+; GCN-NEXT: v_alignbit_b32 v1, v22, v14, 16
+; GCN-NEXT: v_alignbit_b32 v0, v23, v0, 16
+; GCN-NEXT: v_alignbit_b32 v6, v26, v15, 16
+; GCN-NEXT: v_alignbit_b32 v5, v16, v17, 16
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:16
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120
-; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:100
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_alignbit_b32 v4, v4, v23, 16
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: s_waitcnt vmcnt(13)
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: s_waitcnt vmcnt(12)
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: s_waitcnt vmcnt(11)
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: s_waitcnt vmcnt(10)
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: s_waitcnt vmcnt(7)
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v11
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v12
+; GCN-NEXT: s_waitcnt vmcnt(4)
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v13
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v18
; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v20
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108
-; GCN-NEXT: v_alignbit_b32 v11, v8, v9, 16
-; GCN-NEXT: v_alignbit_b32 v10, v10, v12, 16
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v13
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14
-; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:104
-; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v12
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_alignbit_b32 v8, v8, v12, 16
-; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:112
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96
-; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92
-; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88
-; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:84
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GCN-NEXT: v_alignbit_b32 v13, v7, v14, 16
+; GCN-NEXT: v_alignbit_b32 v12, v15, v16, 16
+; GCN-NEXT: v_alignbit_b32 v11, v17, v22, 16
+; GCN-NEXT: v_alignbit_b32 v10, v10, v23, 16
+; GCN-NEXT: v_alignbit_b32 v17, v20, v25, 16
+; GCN-NEXT: v_alignbit_b32 v16, v21, v18, 16
+; GCN-NEXT: v_alignbit_b32 v15, v26, v19, 16
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:32
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(7)
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: s_waitcnt vmcnt(4)
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76
-; GCN-NEXT: v_alignbit_b32 v11, v8, v9, 16
-; GCN-NEXT: v_alignbit_b32 v10, v10, v12, 16
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v13
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14
-; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:72
-; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:68
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_alignbit_b32 v8, v8, v12, 16
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v30
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v29
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
-; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:96
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32
-; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:32
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; GCN-NEXT: v_alignbit_b32 v3, v1, v6, 16
-; GCN-NEXT: v_alignbit_b32 v2, v2, v4, 16
-; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28
-; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v12
-; GCN-NEXT: v_alignbit_b32 v1, v1, v13, 16
-; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20
-; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v20
-; GCN-NEXT: v_alignbit_b32 v0, v4, v0, 16
-; GCN-NEXT: v_alignbit_b32 v6, v5, v19, 16
-; GCN-NEXT: v_alignbit_b32 v5, v13, v21, 16
-; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v22
-; GCN-NEXT: v_alignbit_b32 v4, v4, v23, 16
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64
-; GCN-NEXT: s_waitcnt vmcnt(9)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: s_waitcnt vmcnt(8)
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v21
+; GCN-NEXT: v_alignbit_b32 v14, v7, v14, 16
+; GCN-NEXT: v_alignbit_b32 v7, v18, v24, 16
+; GCN-NEXT: v_alignbit_b32 v21, v19, v20, 16
+; GCN-NEXT: v_alignbit_b32 v20, v25, v22, 16
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:48
; GCN-NEXT: s_waitcnt vmcnt(7)
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v7
-; GCN-NEXT: s_waitcnt vmcnt(6)
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v10
-; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v7
-; GCN-NEXT: v_alignbit_b32 v7, v8, v15, 16
-; GCN-NEXT: v_alignbit_b32 v11, v9, v20, 16
-; GCN-NEXT: v_alignbit_b32 v10, v21, v10, 16
-; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56
-; GCN-NEXT: s_waitcnt vmcnt(6)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v12
-; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16
-; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52
-; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:48
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v23
+; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16
; GCN-NEXT: s_waitcnt vmcnt(6)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v14
-; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_alignbit_b32 v8, v8, v14, 16
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22
; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_alignbit_b32 v18, v18, v22, 16
; GCN-NEXT: s_waitcnt vmcnt(4)
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v25
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26
; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GCN-NEXT: v_alignbit_b32 v15, v14, v15, 16
-; GCN-NEXT: v_alignbit_b32 v14, v19, v12, 16
-; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40
-; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v27
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:36
-; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GCN-NEXT: v_alignbit_b32 v13, v13, v18, 16
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_alignbit_b32 v25, v22, v23, 16
+; GCN-NEXT: v_alignbit_b32 v24, v24, v26, 16
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:44
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v29
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_alignbit_b32 v23, v23, v22, 16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19
-; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_alignbit_b32 v12, v12, v18, 16
-; GCN-NEXT: buffer_store_dwordx4 v[12:15], v[16:17], s[4:7], 0 addr64 offset:80
-; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:64
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_alignbit_b32 v22, v22, v26, 16
+; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:112
+; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[8:9], s[4:7], 0 addr64 offset:96
+; GCN-NEXT: buffer_store_dwordx4 v[22:25], v[8:9], s[4:7], 0 addr64 offset:80
+; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:64
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -1780,24 +1777,27 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104
; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:100
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16
; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v28
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: s_waitcnt vmcnt(7)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
@@ -1832,16 +1832,97 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76
; GFX7-NEXT: s_waitcnt vmcnt(6)
; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:112
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v37
+; GFX7-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v38
+; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v39
+; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v49
+; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v48
+; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v50
+; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16
+; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
+; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68
+; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:64
+; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:60
+; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56
+; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52
+; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:48
+; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37
+; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16
+; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38
+; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39
+; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48
+; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50
+; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51
+; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16
+; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
+; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36
+; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32
+; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28
+; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24
+; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20
+; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16
+; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37
+; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16
+; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:80
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38
+; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39
+; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48
+; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50
+; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51
+; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16
+; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
+; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37
+; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16
+; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:64
; GFX7-NEXT: s_nop 0
; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v5
; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16
-; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72
-; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68
-; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32
; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -1852,124 +1933,39 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8
; GFX7-NEXT: v_alignbit_b32 v10, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v23
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v22
+; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v21
; GFX7-NEXT: v_alignbit_b32 v9, v0, v1, 16
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v21
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v20
-; GFX7-NEXT: v_alignbit_b32 v8, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v20
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v18
+; GFX7-NEXT: v_alignbit_b32 v8, v6, v7, 16
; GFX7-NEXT: v_alignbit_b32 v7, v0, v1, 16
-; GFX7-NEXT: s_waitcnt vmcnt(9)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v37
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v28
-; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: s_waitcnt vmcnt(9)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v38
-; GFX7-NEXT: v_alignbit_b32 v4, v33, v4, 16
-; GFX7-NEXT: s_waitcnt vmcnt(8)
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v39
-; GFX7-NEXT: v_alignbit_b32 v36, v0, v1, 16
-; GFX7-NEXT: s_waitcnt vmcnt(6)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v49
-; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v48
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v17
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v50
-; GFX7-NEXT: v_alignbit_b32 v35, v18, v19, 16
-; GFX7-NEXT: v_alignbit_b32 v34, v0, v1, 16
-; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
-; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28
-; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:24
-; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20
-; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:16
-; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:12
-; GFX7-NEXT: s_waitcnt vmcnt(8)
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v33, v6, v14, 16
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v16
-; GFX7-NEXT: v_alignbit_b32 v6, v6, v14, 16
-; GFX7-NEXT: s_waitcnt vmcnt(7)
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v15
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v30
-; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v29
-; GFX7-NEXT: v_alignbit_b32 v17, v14, v15, 16
-; GFX7-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:52
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v27
-; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:48
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v26
-; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:44
-; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:8
-; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
-; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:56
-; GFX7-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40
-; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX7-NEXT: v_alignbit_b32 v15, v14, v15, 16
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v25
-; GFX7-NEXT: v_alignbit_b32 v16, v16, v20, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v24
-; GFX7-NEXT: v_alignbit_b32 v14, v14, v20, 16
-; GFX7-NEXT: s_waitcnt vmcnt(14)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_alignbit_b32 v21, v0, v1, 16
-; GFX7-NEXT: s_waitcnt vmcnt(13)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v18
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: s_waitcnt vmcnt(12)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v19
-; GFX7-NEXT: v_alignbit_b32 v20, v0, v1, 16
-; GFX7-NEXT: s_waitcnt vmcnt(11)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v22
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: s_waitcnt vmcnt(10)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v23
-; GFX7-NEXT: v_alignbit_b32 v19, v0, v1, 16
-; GFX7-NEXT: s_waitcnt vmcnt(8)
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v35
-; GFX7-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v29
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v16
+; GFX7-NEXT: v_alignbit_b32 v6, v0, v1, 16
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v38
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: s_waitcnt vmcnt(4)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30
-; GFX7-NEXT: v_alignbit_b32 v18, v0, v1, 16
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v28
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v33
-; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v34
-; GFX7-NEXT: v_alignbit_b32 v25, v0, v1, 16
+; GFX7-NEXT: v_alignbit_b32 v17, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v27
-; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v26
-; GFX7-NEXT: v_alignbit_b32 v24, v22, v23, 16
-; GFX7-NEXT: v_alignbit_b32 v23, v0, v1, 16
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v36
+; GFX7-NEXT: v_alignbit_b32 v16, v14, v15, 16
+; GFX7-NEXT: v_alignbit_b32 v15, v0, v1, 16
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v25
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v37
-; GFX7-NEXT: v_alignbit_b32 v22, v0, v1, 16
-; GFX7-NEXT: buffer_store_dwordx4 v[22:25], v[31:32], s[4:7], 0 addr64 offset:80
-; GFX7-NEXT: buffer_store_dwordx4 v[18:21], v[31:32], s[4:7], 0 addr64 offset:64
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v24
+; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16
+; GFX7-NEXT: v_alignbit_b32 v4, v33, v4, 16
; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[31:32], s[4:7], 0 addr64 offset:48
; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[31:32], s[4:7], 0 addr64 offset:32
; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[31:32], s[4:7], 0 addr64 offset:16
@@ -4880,12 +4876,12 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_mov_b32 s18, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1
-; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_writelane_b32 v21, s30, 0
-; GCN-NEXT: v_writelane_b32 v21, s31, 1
+; GCN-NEXT: v_writelane_b32 v20, s30, 0
+; GCN-NEXT: v_writelane_b32 v20, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16 at gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16 at gotpcrel32@hi+12
@@ -4911,36 +4907,36 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: v_add_i32_e32 v17, vcc, 30, v16
; GCN-NEXT: v_add_i32_e32 v18, vcc, 28, v16
; GCN-NEXT: v_add_i32_e32 v19, vcc, 26, v16
-; GCN-NEXT: v_add_i32_e32 v20, vcc, 24, v16
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NEXT: buffer_store_short v15, v17, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v15, vcc, 22, v16
-; GCN-NEXT: v_add_i32_e32 v17, vcc, 20, v16
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 24, v16
+; GCN-NEXT: v_add_i32_e32 v17, vcc, 22, v16
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: buffer_store_short v14, v18, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v14, vcc, 18, v16
-; GCN-NEXT: v_add_i32_e32 v18, vcc, 16, v16
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 20, v16
+; GCN-NEXT: v_add_i32_e32 v18, vcc, 18, v16
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: buffer_store_short v13, v19, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v13, vcc, 14, v16
-; GCN-NEXT: v_add_i32_e32 v19, vcc, 12, v16
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 16, v16
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 14, v16
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: buffer_store_short v12, v20, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v12, v15, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v12, vcc, 10, v16
-; GCN-NEXT: v_add_i32_e32 v20, vcc, 8, v16
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 12, v16
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 10, v16
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GCN-NEXT: buffer_store_short v11, v15, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v11, v17, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v11, vcc, 6, v16
-; GCN-NEXT: v_add_i32_e32 v15, vcc, 4, v16
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 8, v16
+; GCN-NEXT: v_add_i32_e32 v17, vcc, 6, v16
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: buffer_store_short v10, v17, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v10, v14, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v10, vcc, 2, v16
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 4, v16
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 2, v16
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
@@ -4951,30 +4947,30 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: buffer_store_short v9, v14, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v9, v18, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v8, v18, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v8, v13, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v7, v13, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v7, v19, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v6, v19, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v6, v12, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v5, v12, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v5, v15, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v4, v20, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v4, v11, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v3, v11, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v3, v17, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v2, v15, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v2, v10, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v1, v10, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v1, v14, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v21, 1
-; GCN-NEXT: v_readlane_b32 s30, v21, 0
+; GCN-NEXT: v_readlane_b32 s31, v20, 1
+; GCN-NEXT: v_readlane_b32 s30, v20, 0
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s18
@@ -5365,10 +5361,10 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x50, v0
; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x44, v0
; GCN-NEXT: buffer_store_dword v25, v31, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
@@ -5587,20 +5583,20 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
+; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:8
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
-; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
-; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
-; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
@@ -5617,11 +5613,11 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(18)
-; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT: s_waitcnt vmcnt(18)
-; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:116
+; GFX9-NEXT: s_waitcnt vmcnt(25)
+; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:124
+; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT: s_waitcnt vmcnt(25)
+; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:116
; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7618,197 +7614,197 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GCN-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:26
; GCN-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:28
; GCN-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:30
-; GCN-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:50
-; GCN-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:52
-; GCN-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:54
-; GCN-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:56
-; GCN-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:58
-; GCN-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:60
-; GCN-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:62
+; GCN-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:50
+; GCN-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:52
+; GCN-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:54
+; GCN-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:56
+; GCN-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:58
+; GCN-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:60
+; GCN-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:62
; GCN-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:34
; GCN-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:36
; GCN-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:38
-; GCN-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:40
-; GCN-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:42
+; GCN-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40
+; GCN-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42
; GCN-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44
; GCN-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46
; GCN-NEXT: s_waitcnt vmcnt(8)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32
-; GCN-NEXT: v_add_i32_e32 v32, vcc, 0xfc, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xfc, v0
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31
-; GCN-NEXT: v_add_i32_e32 v31, vcc, 0xf4, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xf4, v0
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v31, vcc, 0xec, v0
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xec, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xe4, v0
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xe4, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xdc, v0
-; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xd8, v0
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xdc, v0
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xd8, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28
-; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xd4, v0
-; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xd0, v0
-; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xcc, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xc8, v0
-; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xc4, v0
-; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xc0, v0
+; GCN-NEXT: buffer_store_dword v2, v27, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xd4, v0
+; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xd0, v0
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xcc, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xbc, v0
+; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xc8, v0
; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xb8, v0
-; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xb4, v0
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xc4, v0
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xc0, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xb0, v0
-; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xac, v0
-; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xa8, v0
-; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 0xbc, v0
+; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xb8, v0
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xb4, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xa4, v0
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 0xb0, v0
; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xa0, v0
-; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x9c, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xac, v0
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xa8, v0
+; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x98, v0
+; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 0xa4, v0
; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x94, v0
-; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x90, v0
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xa0, v0
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x9c, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x8c, v0
-; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x88, v0
-; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x84, v0
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x98, v0
+; GCN-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x94, v0
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x90, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x80, v0
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x8c, v0
; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x7c, v0
-; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x78, v0
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x88, v0
+; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x84, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31
+; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x80, v0
+; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x7c, v0
+; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x78, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v22, vcc, 0x74, v0
-; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x70, v0
-; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x6c, v0
+; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x70, v0
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x6c, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x68, v0
-; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x64, v0
-; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0
+; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x64, v0
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x60, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v20, vcc, 0x5c, v0
-; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x58, v0
-; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x54, v0
+; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x58, v0
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x54, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0
-; GCN-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x4c, v0
-; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x48, v0
+; GCN-NEXT: buffer_store_dword v1, v24, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x4c, v0
+; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x44, v0
-; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v26, vcc, 64, v0
-; GCN-NEXT: v_add_i32_e32 v27, vcc, 60, v0
+; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0
+; GCN-NEXT: v_add_i32_e32 v31, vcc, 60, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0
-; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0
-; GCN-NEXT: v_add_i32_e32 v30, vcc, 48, v0
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 48, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v31, vcc, 44, v0
+; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 44, v0
; GCN-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0
; GCN-NEXT: v_add_i32_e32 v33, vcc, 36, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v25, vcc, 32, v0
-; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v32, vcc, 28, v0
+; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 32, v0
+; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 28, v0
; GCN-NEXT: v_add_i32_e32 v34, vcc, 24, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v20, vcc, 20, v0
-; GCN-NEXT: buffer_store_dword v1, v24, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v24, vcc, 16, v0
+; GCN-NEXT: buffer_store_dword v1, v26, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 16, v0
; GCN-NEXT: v_add_i32_e32 v35, vcc, 12, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v28, vcc, 8, v0
+; GCN-NEXT: buffer_store_dword v2, v27, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 8, v0
; GCN-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v19, vcc, 4, v0
; GCN-NEXT: s_waitcnt expcnt(0)
@@ -7824,34 +7820,34 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v8
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v11
-; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen
; GCN-NEXT: v_cvt_f64_f32_e32 v[5:6], v10
-; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v9
; GCN-NEXT: v_cvt_f64_f32_e32 v[7:8], v12
-; GCN-NEXT: v_cvt_f64_f32_e32 v[9:10], v13
+; GCN-NEXT: v_cvt_f64_f32_e32 v[9:10], v36
; GCN-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen
-; GCN-NEXT: v_cvt_f64_f32_e32 v[11:12], v36
-; GCN-NEXT: buffer_store_dword v3, v26, s[0:3], 0 offen
+; GCN-NEXT: v_cvt_f64_f32_e32 v[11:12], v13
+; GCN-NEXT: buffer_store_dword v3, v25, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v14
; GCN-NEXT: v_cvt_f64_f32_e32 v[13:14], v15
; GCN-NEXT: v_cvt_f64_f32_e32 v[15:16], v16
-; GCN-NEXT: buffer_store_dword v6, v27, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v17, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v12, v31, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v11, v21, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v10, v29, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v9, v21, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v16, v33, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v15, v25, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v14, v32, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v15, v23, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v14, v30, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v13, v34, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v4, v20, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v3, v24, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v10, v35, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v9, v28, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v3, v26, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v12, v35, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v11, v27, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v8, v19, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
@@ -7864,258 +7860,258 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:62
-; GFX7-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:60
-; GFX7-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:58
-; GFX7-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:56
-; GFX7-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:54
-; GFX7-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:52
-; GFX7-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:50
-; GFX7-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:48
-; GFX7-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:32
-; GFX7-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:34
-; GFX7-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:36
-; GFX7-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:38
-; GFX7-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40
-; GFX7-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42
-; GFX7-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44
-; GFX7-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46
-; GFX7-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64
-; GFX7-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:2
-; GFX7-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:4
-; GFX7-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:6
-; GFX7-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:8
-; GFX7-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:10
-; GFX7-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:12
+; GFX7-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:62
+; GFX7-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:60
+; GFX7-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:58
+; GFX7-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:56
+; GFX7-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:54
+; GFX7-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:52
+; GFX7-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:50
+; GFX7-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:34
+; GFX7-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:36
+; GFX7-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:38
+; GFX7-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:40
+; GFX7-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:42
+; GFX7-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:44
+; GFX7-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:46
+; GFX7-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:2
+; GFX7-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:4
+; GFX7-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:6
+; GFX7-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:8
+; GFX7-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:10
+; GFX7-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:12
; GFX7-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:14
-; GFX7-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 offset:18
-; GFX7-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:20
-; GFX7-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:22
-; GFX7-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:24
-; GFX7-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:26
-; GFX7-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:28
-; GFX7-NEXT: buffer_load_ushort v1, v[1:2], s[4:7], 0 addr64 offset:30
+; GFX7-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:18
+; GFX7-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:20
+; GFX7-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:22
+; GFX7-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:24
+; GFX7-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:26
+; GFX7-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:28
+; GFX7-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:30
; GFX7-NEXT: s_waitcnt vmcnt(14)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v20
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xfc, v0
-; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v17
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xfc, v0
+; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0
-; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v22
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf4, v0
-; GFX7-NEXT: v_add_i32_e32 v22, vcc, 0xd8, v0
-; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v18
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xf4, v0
+; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xd8, v0
+; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0
-; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v23
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xec, v0
-; GFX7-NEXT: s_waitcnt vmcnt(14)
-; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v19
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xec, v0
+; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0xd4, v0
+; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0
-; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v24
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe4, v0
-; GFX7-NEXT: v_add_i32_e32 v24, vcc, 0xd0, v0
-; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v20
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xe4, v0
+; GFX7-NEXT: v_add_i32_e32 v20, vcc, 0xd0, v0
+; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v25
-; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v21
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xdc, v0
-; GFX7-NEXT: s_waitcnt vmcnt(8)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v26
-; GFX7-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v21
-; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v27
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xd4, v0
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
-; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v20, v24, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v28
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xcc, v0
-; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc8, v0
-; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc4, v0
-; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v34
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[21:22], v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v21
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v17
+; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xdc, v0
+; GFX7-NEXT: s_waitcnt vmcnt(14)
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v22
+; GFX7-NEXT: buffer_store_dword v1, v18, s[0:3], 0 offen
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v23
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
+; GFX7-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v1, v20, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xcc, v0
+; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v24
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xc8, v0
+; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xc4, v0
+; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v31
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc0, v0
-; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xbc, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v33
-; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v20
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb8, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v32
-; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb4, v0
-; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xbc, v0
+; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v30
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xb8, v0
+; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xb4, v0
+; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v29
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb0, v0
-; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xac, v0
-; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v31
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[21:22], v21
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa8, v0
-; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa4, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v30
-; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v20
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xac, v0
+; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v28
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xa8, v0
+; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xa4, v0
+; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v27
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v29
-; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x9c, v0
-; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x98, v0
-; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x94, v0
-; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x9c, v0
+; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v26
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x98, v0
+; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x94, v0
+; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v25
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x90, v0
-; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v18
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
-; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x8c, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v19
-; GFX7-NEXT: buffer_store_dword v21, v18, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x88, v0
-; GFX7-NEXT: buffer_store_dword v20, v18, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[18:19], v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v15
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
-; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x84, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v17
-; GFX7-NEXT: buffer_store_dword v21, v15, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x80, v0
-; GFX7-NEXT: buffer_store_dword v20, v15, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x8c, v0
+; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x88, v0
+; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v16
; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x7c, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX7-NEXT: buffer_store_dword v2, v15, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
+; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x84, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v32
+; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x80, v0
; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v14
-; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v16
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
-; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x74, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX7-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x70, v0
-; GFX7-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v12
-; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
-; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x6c, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX7-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v13, vcc, 0x68, v0
-; GFX7-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v10
-; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x64, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v16
+; GFX7-NEXT: s_waitcnt vmcnt(14)
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v34
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
+; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x7c, v0
+; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x74, v0
+; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x78, v0
+; GFX7-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v33
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x70, v0
+; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v14
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v13
+; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x6c, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: buffer_store_dword v11, v16, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v8
-; GFX7-NEXT: v_add_i32_e32 v11, vcc, 0x60, v0
-; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x5c, v0
-; GFX7-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: buffer_store_dword v16, v8, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v5
+; GFX7-NEXT: buffer_store_dword v14, v19, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v14, vcc, 0x68, v0
+; GFX7-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v12
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v11
+; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x64, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v12, vcc, 0x60, v0
+; GFX7-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v10
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
+; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x5c, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x58, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[19:20], v7
+; GFX7-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
-; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX7-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0
+; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0
+; GFX7-NEXT: buffer_store_dword v20, v7, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0
+; GFX7-NEXT: buffer_store_dword v19, v7, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0
+; GFX7-NEXT: buffer_store_dword v5, v19, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0
; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
; GFX7-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[5:6], v16
-; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x4c, v0
-; GFX7-NEXT: buffer_store_dword v4, v16, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0
-; GFX7-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[3:4], v11
-; GFX7-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[7:8], v7
-; GFX7-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v6, vcc, 64, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 60, v0
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[19:20], v10
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 56, v0
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 64, v0
; GFX7-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 60, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
+; GFX7-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 56, v0
+; GFX7-NEXT: buffer_store_dword v19, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 52, v0
-; GFX7-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 48, v0
; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 48, v0
+; GFX7-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 44, v0
-; GFX7-NEXT: buffer_store_dword v10, v3, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 40, v0
; GFX7-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 40, v0
+; GFX7-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 36, v0
-; GFX7-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0
; GFX7-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 28, v0
-; GFX7-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 24, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
; GFX7-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 24, v0
+; GFX7-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 20, v0
+; GFX7-NEXT: buffer_store_dword v18, v3, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 16, v0
+; GFX7-NEXT: buffer_store_dword v17, v3, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 12, v0
+; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 8, v0
+; GFX7-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0
; GFX7-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0
-; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, 12, v0
-; GFX7-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, 8, v0
-; GFX7-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v0
-; GFX7-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v32bf16_to_v32f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, 2, v1
-; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 2, v1
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 6, v1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v1
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 8, v1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 6, v1
; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 8, v1
+; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, 10, v1
; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v13, vcc, 12, v1
; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 14, v1
-; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v21, vcc, 16, v1
-; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v15, vcc, 18, v1
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, 14, v1
; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v19, vcc, 20, v1
+; GFX8-NEXT: v_add_u32_e32 v19, vcc, 16, v1
; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 18, v1
+; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v21, vcc, 20, v1
+; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, 22, v1
; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v25, vcc, 24, v1
@@ -8126,469 +8122,473 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: v_addc_u32_e32 v30, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v31, vcc, 30, v1
; GFX8-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v33, vcc, 32, v1
+; GFX8-NEXT: v_add_u32_e32 v33, vcc, 34, v1
; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v35, vcc, 34, v1
+; GFX8-NEXT: v_add_u32_e32 v35, vcc, 36, v1
; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc
-; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX8-NEXT: v_add_u32_e32 v37, vcc, 36, v1
-; GFX8-NEXT: flat_load_ushort v43, v[1:2]
+; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1
+; GFX8-NEXT: flat_load_ushort v44, v[1:2]
; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v48, vcc, 38, v1
+; GFX8-NEXT: v_add_u32_e32 v48, vcc, 40, v1
; GFX8-NEXT: v_addc_u32_e32 v49, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v50, vcc, 62, v1
; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v44, v[50:51]
+; GFX8-NEXT: flat_load_ushort v45, v[50:51]
; GFX8-NEXT: v_add_u32_e32 v50, vcc, 60, v1
; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v45, v[50:51]
-; GFX8-NEXT: v_add_u32_e32 v50, vcc, 40, v1
+; GFX8-NEXT: flat_load_ushort v46, v[50:51]
+; GFX8-NEXT: v_add_u32_e32 v50, vcc, 42, v1
; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v52, vcc, 58, v1
; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v46, v[52:53]
-; GFX8-NEXT: v_add_u32_e32 v52, vcc, 42, v1
+; GFX8-NEXT: flat_load_ushort v47, v[52:53]
+; GFX8-NEXT: v_add_u32_e32 v52, vcc, 44, v1
; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v54, vcc, 56, v1
; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v47, v[54:55]
-; GFX8-NEXT: v_add_u32_e32 v54, vcc, 44, v1
+; GFX8-NEXT: flat_load_ushort v56, v[54:55]
+; GFX8-NEXT: v_add_u32_e32 v54, vcc, 46, v1
; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v39, vcc, 54, v1
; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v56, v[39:40]
-; GFX8-NEXT: v_add_u32_e32 v39, vcc, 52, v1
-; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_ushort v57, v[39:40]
-; GFX8-NEXT: v_add_u32_e32 v39, vcc, 46, v1
+; GFX8-NEXT: v_add_u32_e32 v39, vcc, 52, v1
; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v41, vcc, 50, v1
-; GFX8-NEXT: v_addc_u32_e32 v42, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v41, v[41:42]
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 48, v1
-; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v42, v[9:10]
-; GFX8-NEXT: flat_load_ushort v9, v[35:36]
-; GFX8-NEXT: flat_load_ushort v10, v[37:38]
-; GFX8-NEXT: flat_load_ushort v35, v[48:49]
-; GFX8-NEXT: flat_load_ushort v36, v[50:51]
-; GFX8-NEXT: flat_load_ushort v37, v[52:53]
-; GFX8-NEXT: flat_load_ushort v48, v[54:55]
-; GFX8-NEXT: flat_load_ushort v39, v[39:40]
-; GFX8-NEXT: flat_load_ushort v49, v[1:2]
-; GFX8-NEXT: flat_load_ushort v50, v[3:4]
-; GFX8-NEXT: flat_load_ushort v51, v[5:6]
-; GFX8-NEXT: flat_load_ushort v52, v[7:8]
-; GFX8-NEXT: flat_load_ushort v53, v[11:12]
-; GFX8-NEXT: flat_load_ushort v38, v[13:14]
-; GFX8-NEXT: flat_load_ushort v14, v[17:18]
-; GFX8-NEXT: flat_load_ushort v11, v[21:22]
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v0
-; GFX8-NEXT: flat_load_ushort v15, v[15:16]
-; GFX8-NEXT: flat_load_ushort v13, v[19:20]
-; GFX8-NEXT: flat_load_ushort v8, v[23:24]
-; GFX8-NEXT: flat_load_ushort v6, v[25:26]
-; GFX8-NEXT: flat_load_ushort v5, v[27:28]
-; GFX8-NEXT: flat_load_ushort v7, v[29:30]
-; GFX8-NEXT: flat_load_ushort v12, v[31:32]
-; GFX8-NEXT: flat_load_ushort v16, v[33:34]
-; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xc4, v0
-; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xbc, v0
-; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xb4, v0
-; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xac, v0
-; GFX8-NEXT: v_add_u32_e32 v26, vcc, 0xa4, v0
-; GFX8-NEXT: v_add_u32_e32 v27, vcc, 0x9c, v0
+; GFX8-NEXT: flat_load_ushort v58, v[39:40]
+; GFX8-NEXT: v_add_u32_e32 v40, vcc, 48, v1
+; GFX8-NEXT: v_addc_u32_e32 v41, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v42, vcc, 50, v1
+; GFX8-NEXT: v_addc_u32_e32 v43, vcc, 0, v2, vcc
+; GFX8-NEXT: flat_load_ushort v42, v[42:43]
+; GFX8-NEXT: flat_load_ushort v34, v[33:34]
+; GFX8-NEXT: flat_load_ushort v36, v[35:36]
+; GFX8-NEXT: flat_load_ushort v38, v[37:38]
+; GFX8-NEXT: flat_load_ushort v39, v[48:49]
+; GFX8-NEXT: flat_load_ushort v48, v[50:51]
+; GFX8-NEXT: flat_load_ushort v51, v[52:53]
+; GFX8-NEXT: flat_load_ushort v52, v[54:55]
+; GFX8-NEXT: flat_load_ushort v53, v[40:41]
+; GFX8-NEXT: v_add_u32_e32 v49, vcc, 32, v1
+; GFX8-NEXT: v_addc_u32_e32 v50, vcc, 0, v2, vcc
+; GFX8-NEXT: flat_load_ushort v37, v[3:4]
+; GFX8-NEXT: flat_load_ushort v35, v[5:6]
+; GFX8-NEXT: flat_load_ushort v33, v[7:8]
+; GFX8-NEXT: flat_load_ushort v8, v[9:10]
+; GFX8-NEXT: flat_load_ushort v6, v[11:12]
+; GFX8-NEXT: flat_load_ushort v4, v[13:14]
+; GFX8-NEXT: flat_load_ushort v2, v[15:16]
+; GFX8-NEXT: flat_load_ushort v1, v[19:20]
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, 4, v0
+; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0x7c, v0
; GFX8-NEXT: s_waitcnt vmcnt(14)
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v43
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xfc, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v44
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v45
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xf8, v0
-; GFX8-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xf4, v0
-; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v46
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xf0, v0
-; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xec, v0
-; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xe8, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v47
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX8-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xe4, v0
-; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xe0, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v56
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
-; GFX8-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xdc, v0
-; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v57
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xd8, v0
-; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xd4, v0
-; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xd0, v0
-; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v41
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xcc, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v42
-; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xc8, v0
-; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v49
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v50
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v44
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v3
+; GFX8-NEXT: flat_load_ushort v3, v[17:18]
+; GFX8-NEXT: flat_load_ushort v5, v[21:22]
+; GFX8-NEXT: flat_load_ushort v7, v[23:24]
+; GFX8-NEXT: flat_load_ushort v9, v[25:26]
+; GFX8-NEXT: flat_load_ushort v10, v[27:28]
+; GFX8-NEXT: flat_load_ushort v11, v[29:30]
+; GFX8-NEXT: flat_load_ushort v12, v[31:32]
+; GFX8-NEXT: flat_load_ushort v13, v[49:50]
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0x84, v0
+; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
+; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xfc, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v45
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
+; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v46
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xf8, v0
+; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xf4, v0
+; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v47
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xf0, v0
+; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xec, v0
+; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xe8, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v56
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
+; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe4, v0
+; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe0, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v57
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
+; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xdc, v0
+; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v58
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xd8, v0
+; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xd4, v0
+; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v42
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xd0, v0
+; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xcc, v0
+; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(14)
-; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v51
-; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v52
-; GFX8-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xc0, v0
-; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v17
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v39
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
-; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v53
-; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v38
-; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX8-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xb8, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v53
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc8, v0
+; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc4, v0
+; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v52
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xc0, v0
+; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xbc, v0
+; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v51
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xb8, v0
+; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb4, v0
+; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v48
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb0, v0
+; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xac, v0
+; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v39
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xa8, v0
+; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xa4, v0
+; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v38
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xa0, v0
+; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x9c, v0
+; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v36
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x98, v0
+; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x94, v0
+; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x90, v0
+; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v34
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x8c, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v37
+; GFX8-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x88, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v16
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v35
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX8-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[17:18], v19
-; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v48
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[19:20], v19
-; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xb0, v0
-; GFX8-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[19:20], v21
-; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v37
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[21:22], v21
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-NEXT: buffer_store_dword v22, v24, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xa8, v0
-; GFX8-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[21:22], v23
-; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v36
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[23:24], v23
-; GFX8-NEXT: buffer_store_dword v24, v26, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xa0, v0
-; GFX8-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[23:24], v25
-; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v35
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[25:26], v25
-; GFX8-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v10
-; GFX8-NEXT: v_add_u32_e32 v26, vcc, 0x98, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v11
-; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x94, v0
-; GFX8-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen
-; GFX8-NEXT: buffer_store_dword v28, v11, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x90, v0
-; GFX8-NEXT: buffer_store_dword v27, v11, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[25:26], v14
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x8c, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v15
-; GFX8-NEXT: buffer_store_dword v28, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x88, v0
-; GFX8-NEXT: buffer_store_dword v27, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v9
-; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v16
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9
-; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v13
-; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x84, v0
-; GFX8-NEXT: buffer_store_dword v28, v13, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x80, v0
-; GFX8-NEXT: buffer_store_dword v27, v13, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9
-; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v12
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v9
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7c, v0
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; GFX8-NEXT: buffer_store_dword v13, v9, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x78, v0
-; GFX8-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x74, v0
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
-; GFX8-NEXT: buffer_store_dword v7, v13, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x70, v0
-; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x80, v0
+; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v13
; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x6c, v0
-; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x68, v0
-; GFX8-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x64, v0
-; GFX8-NEXT: buffer_store_dword v13, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x60, v0
-; GFX8-NEXT: buffer_store_dword v12, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x5c, v0
-; GFX8-NEXT: buffer_store_dword v9, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x58, v0
-; GFX8-NEXT: buffer_store_dword v8, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v33
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: buffer_store_dword v13, v19, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x78, v0
+; GFX8-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v18
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v11
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x74, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: buffer_store_dword v19, v11, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x70, v0
+; GFX8-NEXT: buffer_store_dword v18, v11, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v8
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x6c, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x68, v0
+; GFX8-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x64, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: buffer_store_dword v9, v6, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x60, v0
+; GFX8-NEXT: buffer_store_dword v8, v6, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v4
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x5c, v0
+; GFX8-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x58, v0
+; GFX8-NEXT: buffer_store_dword v6, v4, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v2
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x54, v0
-; GFX8-NEXT: buffer_store_dword v28, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x50, v0
-; GFX8-NEXT: buffer_store_dword v27, v5, s[0:3], 0 offen
+; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
+; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v3
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v4
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x4c, v0
-; GFX8-NEXT: buffer_store_dword v15, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x48, v0
-; GFX8-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x44, v0
-; GFX8-NEXT: buffer_store_dword v11, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 64, v0
-; GFX8-NEXT: buffer_store_dword v10, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 60, v0
-; GFX8-NEXT: buffer_store_dword v26, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 56, v0
-; GFX8-NEXT: buffer_store_dword v25, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 52, v0
-; GFX8-NEXT: buffer_store_dword v24, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 48, v0
-; GFX8-NEXT: buffer_store_dword v23, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 44, v0
-; GFX8-NEXT: buffer_store_dword v22, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 40, v0
-; GFX8-NEXT: buffer_store_dword v21, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 36, v0
-; GFX8-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v0
-; GFX8-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 28, v0
-; GFX8-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 24, v0
-; GFX8-NEXT: buffer_store_dword v17, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 20, v0
-; GFX8-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v0
-; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 12, v0
+; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0
+; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x44, v0
+; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 64, v0
+; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 60, v0
+; GFX8-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 56, v0
+; GFX8-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 52, v0
+; GFX8-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 48, v0
+; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 44, v0
+; GFX8-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 40, v0
+; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 36, v0
+; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v0
+; GFX8-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 28, v0
+; GFX8-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 24, v0
+; GFX8-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 20, v0
+; GFX8-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v0
+; GFX8-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 12, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0
-; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen
+; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen
+; GFX8-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v32bf16_to_v32f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v21, v[1:2], off offset:62
-; GFX9-NEXT: global_load_ushort v23, v[1:2], off offset:60
-; GFX9-NEXT: global_load_ushort v24, v[1:2], off offset:58
-; GFX9-NEXT: global_load_ushort v25, v[1:2], off offset:56
-; GFX9-NEXT: global_load_ushort v26, v[1:2], off offset:54
-; GFX9-NEXT: global_load_ushort v27, v[1:2], off offset:52
-; GFX9-NEXT: global_load_ushort v28, v[1:2], off offset:50
-; GFX9-NEXT: global_load_ushort v29, v[1:2], off offset:48
-; GFX9-NEXT: global_load_ushort v30, v[1:2], off offset:46
-; GFX9-NEXT: global_load_ushort v31, v[1:2], off offset:44
-; GFX9-NEXT: global_load_ushort v32, v[1:2], off offset:42
-; GFX9-NEXT: global_load_ushort v33, v[1:2], off offset:40
-; GFX9-NEXT: global_load_ushort v34, v[1:2], off offset:38
-; GFX9-NEXT: global_load_ushort v19, v[1:2], off
-; GFX9-NEXT: global_load_ushort v20, v[1:2], off offset:36
-; GFX9-NEXT: global_load_ushort v17, v[1:2], off offset:2
-; GFX9-NEXT: global_load_ushort v18, v[1:2], off offset:4
-; GFX9-NEXT: global_load_ushort v16, v[1:2], off offset:34
-; GFX9-NEXT: global_load_ushort v11, v[1:2], off offset:32
-; GFX9-NEXT: global_load_ushort v13, v[1:2], off offset:6
-; GFX9-NEXT: global_load_ushort v14, v[1:2], off offset:8
-; GFX9-NEXT: global_load_ushort v15, v[1:2], off offset:30
+; GFX9-NEXT: global_load_ushort v8, v[1:2], off offset:62
+; GFX9-NEXT: global_load_ushort v10, v[1:2], off offset:60
+; GFX9-NEXT: global_load_ushort v11, v[1:2], off offset:58
+; GFX9-NEXT: global_load_ushort v12, v[1:2], off offset:56
+; GFX9-NEXT: global_load_ushort v13, v[1:2], off offset:54
+; GFX9-NEXT: global_load_ushort v14, v[1:2], off offset:52
+; GFX9-NEXT: global_load_ushort v15, v[1:2], off offset:50
+; GFX9-NEXT: global_load_ushort v16, v[1:2], off offset:48
+; GFX9-NEXT: global_load_ushort v17, v[1:2], off offset:46
+; GFX9-NEXT: global_load_ushort v18, v[1:2], off offset:44
+; GFX9-NEXT: global_load_ushort v19, v[1:2], off offset:42
+; GFX9-NEXT: global_load_ushort v20, v[1:2], off offset:40
+; GFX9-NEXT: global_load_ushort v21, v[1:2], off offset:38
+; GFX9-NEXT: global_load_ushort v22, v[1:2], off offset:36
+; GFX9-NEXT: global_load_ushort v23, v[1:2], off offset:34
+; GFX9-NEXT: global_load_ushort v24, v[1:2], off offset:32
+; GFX9-NEXT: global_load_ushort v25, v[1:2], off
+; GFX9-NEXT: global_load_ushort v26, v[1:2], off offset:2
+; GFX9-NEXT: global_load_ushort v27, v[1:2], off offset:30
; GFX9-NEXT: global_load_ushort v3, v[1:2], off offset:16
; GFX9-NEXT: global_load_ushort v4, v[1:2], off offset:18
; GFX9-NEXT: global_load_ushort v5, v[1:2], off offset:20
; GFX9-NEXT: global_load_ushort v6, v[1:2], off offset:22
-; GFX9-NEXT: global_load_ushort v8, v[1:2], off offset:24
-; GFX9-NEXT: global_load_ushort v10, v[1:2], off offset:26
-; GFX9-NEXT: global_load_ushort v12, v[1:2], off offset:28
-; GFX9-NEXT: global_load_ushort v9, v[1:2], off offset:10
+; GFX9-NEXT: global_load_ushort v28, v[1:2], off offset:24
+; GFX9-NEXT: global_load_ushort v29, v[1:2], off offset:26
+; GFX9-NEXT: global_load_ushort v30, v[1:2], off offset:28
+; GFX9-NEXT: global_load_ushort v31, v[1:2], off offset:4
+; GFX9-NEXT: global_load_ushort v32, v[1:2], off offset:6
+; GFX9-NEXT: global_load_ushort v33, v[1:2], off offset:8
+; GFX9-NEXT: global_load_ushort v34, v[1:2], off offset:10
; GFX9-NEXT: global_load_ushort v7, v[1:2], off offset:12
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_ushort v1, v[1:2], off offset:14
; GFX9-NEXT: s_waitcnt vmcnt(31)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v21
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v8
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
; GFX9-NEXT: s_waitcnt vmcnt(30)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v10
; GFX9-NEXT: s_waitcnt vmcnt(28)
-; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v25
-; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:252
-; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:248
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v24
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v12
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:252
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:248
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v11
; GFX9-NEXT: s_waitcnt vmcnt(29)
-; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v26
-; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:244
-; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:240
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v13
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:244
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:240
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
; GFX9-NEXT: s_waitcnt vmcnt(30)
-; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v27
-; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:236
-; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:232
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v23
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[23:24], v24
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:236
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:232
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v10
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
; GFX9-NEXT: s_waitcnt vmcnt(31)
-; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v28
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v15
; GFX9-NEXT: s_waitcnt vmcnt(30)
-; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v29
-; GFX9-NEXT: s_waitcnt vmcnt(29)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v30
-; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:228
-; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:224
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v25
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[25:26], v26
-; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:220
-; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:216
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[23:24], v27
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[27:28], v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v16
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:228
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:224
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
+; GFX9-NEXT: s_waitcnt vmcnt(31)
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v17
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v13
+; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:220
+; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:216
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v14
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
+; GFX9-NEXT: s_waitcnt vmcnt(32)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v18
+; GFX9-NEXT: s_waitcnt vmcnt(30)
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v20
; GFX9-NEXT: s_waitcnt vmcnt(28)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v19
-; GFX9-NEXT: s_waitcnt vmcnt(27)
-; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v20
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[19:20], v19
-; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v31
-; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v32
-; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v33
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v34
-; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:212
-; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:208
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v29
-; GFX9-NEXT: s_waitcnt vmcnt(26)
-; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[29:30], v30
-; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:204
-; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:200
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[25:26], v31
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[31:32], v32
-; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:196
-; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:192
-; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:188
-; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:184
-; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:180
-; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:176
-; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:172
-; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:168
-; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:164
-; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:160
-; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:156
-; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:152
-; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v22
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:212
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:208
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v19
+; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v21
+; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:204
+; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:200
+; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:196
+; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:192
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v20
+; GFX9-NEXT: s_waitcnt vmcnt(33)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v23
; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
-; GFX9-NEXT: s_waitcnt vmcnt(39)
-; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:148
-; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:144
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v18
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v19
+; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:188
+; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:184
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:180
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:176
+; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:172
+; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:168
+; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:164
+; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:160
+; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:156
+; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:152
+; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:148
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
+; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:144
+; GFX9-NEXT: s_waitcnt vmcnt(44)
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v24
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:140
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:136
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v10
+; GFX9-NEXT: s_waitcnt vmcnt(43)
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v27
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:132
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:128
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
+; GFX9-NEXT: s_waitcnt vmcnt(38)
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v30
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:124
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v29
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:116
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:112
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v25
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v26
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v2
+; GFX9-NEXT: s_waitcnt vmcnt(41)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v31
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v28
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v2
; GFX9-NEXT: s_waitcnt vmcnt(40)
-; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v13
-; GFX9-NEXT: s_waitcnt vmcnt(39)
-; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v14
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v11
-; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:140
-; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:136
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v32
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:108
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:104
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v18
; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v2
+; GFX9-NEXT: s_waitcnt vmcnt(41)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v33
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v2
; GFX9-NEXT: s_waitcnt vmcnt(40)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v15
-; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:132
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v2
-; GFX9-NEXT: s_waitcnt vmcnt(34)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v12
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v10
-; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:128
-; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:116
-; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:112
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX9-NEXT: s_waitcnt vmcnt(38)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v9
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
-; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:108
-; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:104
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2
-; GFX9-NEXT: s_waitcnt vmcnt(39)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v34
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-NEXT: s_waitcnt vmcnt(38)
-; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v5
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100
; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[5:6], v2
+; GFX9-NEXT: s_waitcnt vmcnt(41)
+; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v7
+; GFX9-NEXT: s_waitcnt vmcnt(40)
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:88
; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
-; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:92
-; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:88
-; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:84
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v5
-; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v18
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v21
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v22
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[22:23], v23
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[5:6], v12
-; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:76
-; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:72
-; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:68
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64
-; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:56
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:52
-; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:48
-; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:44
-; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:40
-; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:36
-; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:20
-; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen
+; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:84
+; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:80
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v7
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v3
+; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:76
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v22
+; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68
+; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:64
+; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:56
+; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:52
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
+; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:44
+; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:40
+; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:36
+; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:32
+; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:28
+; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:24
+; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20
+; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:16
+; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:12
+; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -8612,179 +8612,177 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX10-NEXT: global_load_ushort v16, v[1:2], off offset:26
; GFX10-NEXT: global_load_ushort v17, v[1:2], off offset:28
; GFX10-NEXT: global_load_ushort v18, v[1:2], off offset:30
-; GFX10-NEXT: global_load_ushort v19, v[1:2], off offset:32
-; GFX10-NEXT: global_load_ushort v20, v[1:2], off offset:34
-; GFX10-NEXT: global_load_ushort v21, v[1:2], off offset:36
-; GFX10-NEXT: global_load_ushort v22, v[1:2], off offset:38
-; GFX10-NEXT: global_load_ushort v23, v[1:2], off offset:40
-; GFX10-NEXT: global_load_ushort v24, v[1:2], off offset:42
-; GFX10-NEXT: global_load_ushort v25, v[1:2], off offset:44
-; GFX10-NEXT: global_load_ushort v26, v[1:2], off offset:46
-; GFX10-NEXT: global_load_ushort v27, v[1:2], off offset:48
-; GFX10-NEXT: global_load_ushort v28, v[1:2], off offset:62
-; GFX10-NEXT: global_load_ushort v29, v[1:2], off offset:50
-; GFX10-NEXT: global_load_ushort v30, v[1:2], off offset:52
-; GFX10-NEXT: global_load_ushort v31, v[1:2], off offset:54
-; GFX10-NEXT: global_load_ushort v32, v[1:2], off offset:60
-; GFX10-NEXT: global_load_ushort v33, v[1:2], off offset:56
-; GFX10-NEXT: global_load_ushort v34, v[1:2], off offset:58
+; GFX10-NEXT: global_load_ushort v19, v[1:2], off offset:62
+; GFX10-NEXT: global_load_ushort v20, v[1:2], off offset:32
+; GFX10-NEXT: global_load_ushort v21, v[1:2], off offset:34
+; GFX10-NEXT: global_load_ushort v22, v[1:2], off offset:36
+; GFX10-NEXT: global_load_ushort v23, v[1:2], off offset:60
+; GFX10-NEXT: global_load_ushort v24, v[1:2], off offset:38
+; GFX10-NEXT: global_load_ushort v25, v[1:2], off offset:40
+; GFX10-NEXT: global_load_ushort v26, v[1:2], off offset:58
+; GFX10-NEXT: global_load_ushort v27, v[1:2], off offset:42
+; GFX10-NEXT: global_load_ushort v28, v[1:2], off offset:44
+; GFX10-NEXT: global_load_ushort v29, v[1:2], off offset:56
+; GFX10-NEXT: global_load_ushort v30, v[1:2], off offset:46
+; GFX10-NEXT: global_load_ushort v31, v[1:2], off offset:48
+; GFX10-NEXT: global_load_ushort v32, v[1:2], off offset:54
+; GFX10-NEXT: global_load_ushort v33, v[1:2], off offset:50
+; GFX10-NEXT: global_load_ushort v34, v[1:2], off offset:52
; GFX10-NEXT: s_waitcnt vmcnt(31)
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v3
; GFX10-NEXT: s_waitcnt vmcnt(30)
-; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v4
; GFX10-NEXT: s_waitcnt vmcnt(29)
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v5
; GFX10-NEXT: s_waitcnt vmcnt(28)
-; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v6
; GFX10-NEXT: s_waitcnt vmcnt(27)
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v7
; GFX10-NEXT: s_waitcnt vmcnt(26)
-; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v8
; GFX10-NEXT: s_waitcnt vmcnt(25)
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v9
; GFX10-NEXT: s_waitcnt vmcnt(24)
-; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
; GFX10-NEXT: s_waitcnt vmcnt(23)
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v11
; GFX10-NEXT: s_waitcnt vmcnt(22)
-; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v12
; GFX10-NEXT: s_waitcnt vmcnt(21)
-; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v13
; GFX10-NEXT: s_waitcnt vmcnt(20)
-; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v14
-; GFX10-NEXT: s_waitcnt vmcnt(19)
-; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v15
-; GFX10-NEXT: s_waitcnt vmcnt(18)
-; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v16
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v37
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[15:16], v38
+; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v14
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[9:10], v35
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v36
+; GFX10-NEXT: s_waitcnt vmcnt(17)
+; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v17
+; GFX10-NEXT: s_waitcnt vmcnt(16)
+; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v18
; GFX10-NEXT: s_waitcnt vmcnt(15)
-; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v19
; GFX10-NEXT: s_waitcnt vmcnt(14)
; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v20
; GFX10-NEXT: s_waitcnt vmcnt(13)
-; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v21
; GFX10-NEXT: s_waitcnt vmcnt(12)
-; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v22
; GFX10-NEXT: s_waitcnt vmcnt(11)
-; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v23
-; GFX10-NEXT: s_waitcnt vmcnt(10)
-; GFX10-NEXT: v_lshlrev_b32_e32 v71, 16, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v23
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GFX10-NEXT: s_waitcnt vmcnt(9)
-; GFX10-NEXT: v_lshlrev_b32_e32 v80, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v71, 16, v25
; GFX10-NEXT: s_waitcnt vmcnt(8)
-; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v26
; GFX10-NEXT: s_waitcnt vmcnt(7)
-; GFX10-NEXT: v_lshlrev_b32_e32 v82, 16, v27
-; GFX10-NEXT: s_waitcnt vmcnt(6)
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v80, 16, v27
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
; GFX10-NEXT: s_waitcnt vmcnt(5)
-; GFX10-NEXT: v_lshlrev_b32_e32 v83, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v29
; GFX10-NEXT: s_waitcnt vmcnt(4)
-; GFX10-NEXT: v_lshlrev_b32_e32 v84, 16, v30
-; GFX10-NEXT: s_waitcnt vmcnt(3)
-; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v31
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v30
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v32
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v32
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v33
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v7
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v34
-; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v33
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v29
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v84
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v13
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v21
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[25:26], v50
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[27:28], v51
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[50:51], v82
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[31:32], v52
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[33:34], v53
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[52:53], v80
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v35
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[9:10], v36
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v48
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[23:24], v49
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[35:36], v54
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[48:49], v55
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[54:55], v70
-; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v34
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v31
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v24
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v19
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[31:32], v71
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[35:36], v68
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v16
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[33:34], v70
+; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v15
; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:252
; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:248
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v83
-; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v17
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
-; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:244
-; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:240
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v81
-; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:236
-; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:232
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v71
-; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:228
-; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:224
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v65
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[64:65], v64
-; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:220
-; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:216
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v67
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[66:67], v66
-; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:212
-; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:208
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v69
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[17:18], v39
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[68:69], v68
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v23
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[15:16], v37
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[17:18], v38
+; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:244
+; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:240
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v25
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v66
+; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:236
+; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:232
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v27
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[23:24], v48
+; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:228
+; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:224
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v81
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[25:26], v49
+; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220
+; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v80
+; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:212
+; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:208
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v69
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[48:49], v64
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[27:28], v50
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v51
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[50:51], v54
; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:204
; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:200
-; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:196
-; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:192
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v67
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v39
+; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:196
+; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:192
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v65
; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:188
; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:184
-; GFX10-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:180
-; GFX10-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:176
-; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:172
-; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:168
-; GFX10-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:164
-; GFX10-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:160
-; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:156
-; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:152
-; GFX10-NEXT: buffer_store_dword v65, v0, s[0:3], 0 offen offset:148
-; GFX10-NEXT: buffer_store_dword v64, v0, s[0:3], 0 offen offset:144
-; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:140
-; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:136
-; GFX10-NEXT: buffer_store_dword v67, v0, s[0:3], 0 offen offset:132
-; GFX10-NEXT: buffer_store_dword v66, v0, s[0:3], 0 offen offset:128
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v55
+; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:180
+; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:176
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v53
+; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:172
+; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:168
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v52
+; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:164
+; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:160
+; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:156
+; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:152
+; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:148
+; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:144
+; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:140
+; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:136
+; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:132
+; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:128
; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:124
; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:120
-; GFX10-NEXT: buffer_store_dword v69, v0, s[0:3], 0 offen offset:116
-; GFX10-NEXT: buffer_store_dword v68, v0, s[0:3], 0 offen offset:112
+; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:116
+; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:112
; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:108
; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:104
-; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:100
-; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:96
-; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:92
-; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:88
-; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:84
-; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:80
-; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:76
-; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:72
-; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:68
-; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:64
-; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:60
-; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:56
-; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:52
-; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:48
-; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:44
-; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:40
-; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:36
-; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:32
-; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:28
-; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:24
-; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:20
-; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:16
-; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:12
-; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:8
-; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
-; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:100
+; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:96
+; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:92
+; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:88
+; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:84
+; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:80
+; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:76
+; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:72
+; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:68
+; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:64
+; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:60
+; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:56
+; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:52
+; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:48
+; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:44
+; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:40
+; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:36
+; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:32
+; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:28
+; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:24
+; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20
+; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16
+; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:12
+; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8
+; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:4
+; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v32bf16_to_v32f64:
@@ -10059,55 +10057,47 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_add_f32_e32 v12, v12, v28
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_add_f32_e32 v11, v11, v27
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_add_f32_e32 v10, v10, v26
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_add_f32_e32 v9, v9, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_add_f32_e32 v8, v8, v24
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_add_f32_e32 v7, v7, v23
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_add_f32_e32 v6, v6, v22
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_add_f32_e32 v5, v5, v21
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_add_f32_e32 v11, v11, v27
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_add_f32_e32 v4, v4, v20
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
@@ -10116,6 +10106,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_add_f32_e32 v10, v10, v26
+; GCN-NEXT: v_add_f32_e32 v9, v9, v25
+; GCN-NEXT: v_add_f32_e32 v8, v8, v24
+; GCN-NEXT: v_add_f32_e32 v7, v7, v23
+; GCN-NEXT: v_add_f32_e32 v6, v6, v22
+; GCN-NEXT: v_add_f32_e32 v5, v5, v21
+; GCN-NEXT: v_add_f32_e32 v4, v4, v20
; GCN-NEXT: v_add_f32_e32 v3, v3, v19
; GCN-NEXT: v_add_f32_e32 v2, v2, v18
; GCN-NEXT: v_add_f32_e32 v1, v1, v17
@@ -10135,7 +10133,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_add_f32_e32 v15, v15, v16
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
@@ -10145,20 +10143,22 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-LABEL: v_fadd_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_add_f32_e32 v11, v11, v27
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v22
-; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
@@ -10169,25 +10169,24 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
@@ -10212,7 +10211,6 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_add_f32_e32 v14, v14, v30
; GFX7-NEXT: v_add_f32_e32 v13, v13, v29
; GFX7-NEXT: v_add_f32_e32 v12, v12, v28
-; GFX7-NEXT: v_add_f32_e32 v11, v11, v27
; GFX7-NEXT: v_add_f32_e32 v10, v10, v26
; GFX7-NEXT: v_add_f32_e32 v9, v9, v25
; GFX7-NEXT: v_add_f32_e32 v8, v8, v24
@@ -10231,7 +10229,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_add_f32_e32 v15, v15, v22
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
@@ -11689,10 +11687,10 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
@@ -11995,278 +11993,278 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24
-; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23
-; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
-; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
-; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX10-NEXT: v_add_f32_e32 v39, v48, v39
-; GFX10-NEXT: v_add_f32_e32 v11, v11, v27
-; GFX10-NEXT: v_add_f32_e32 v49, v50, v49
-; GFX10-NEXT: v_add_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_add_f32_e32 v37, v38, v37
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
; GFX10-NEXT: v_add_f32_e32 v12, v12, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22
+; GFX10-NEXT: v_add_f32_e32 v39, v48, v39
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21
+; GFX10-NEXT: v_add_f32_e32 v49, v50, v49
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5
+; GFX10-NEXT: v_add_f32_e32 v33, v34, v33
+; GFX10-NEXT: v_add_f32_e32 v14, v14, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24
+; GFX10-NEXT: v_add_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_add_f32_e32 v13, v13, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23
+; GFX10-NEXT: v_add_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16
+; GFX10-NEXT: v_add_f32_e32 v27, v50, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9
+; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT: v_add_f32_e32 v8, v8, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18
+; GFX10-NEXT: v_add_f32_e32 v29, v38, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
-; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1
+; GFX10-NEXT: v_add_f32_e32 v7, v7, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17
+; GFX10-NEXT: v_add_f32_e32 v28, v48, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1
; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16
-; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
+; GFX10-NEXT: v_add_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20
+; GFX10-NEXT: v_add_f32_e32 v34, v34, v51
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_add_f32_e32 v9, v9, v25
-; GFX10-NEXT: v_add_f32_e32 v25, v54, v53
-; GFX10-NEXT: v_add_f32_e32 v8, v8, v24
-; GFX10-NEXT: v_add_f32_e32 v24, v64, v55
-; GFX10-NEXT: v_add_f32_e32 v7, v7, v23
-; GFX10-NEXT: v_add_f32_e32 v23, v66, v65
-; GFX10-NEXT: v_add_f32_e32 v6, v6, v22
-; GFX10-NEXT: v_add_f32_e32 v22, v68, v67
-; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1
-; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
-; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
-; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_add_f32_e32 v35, v36, v35
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
-; GFX10-NEXT: v_add_f32_e32 v13, v13, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19
+; GFX10-NEXT: v_add_f32_e32 v30, v36, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3
; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_add_f32_e32 v2, v2, v18
-; GFX10-NEXT: v_add_f32_e32 v18, v27, v48
+; GFX10-NEXT: v_add_f32_e32 v18, v48, v23
; GFX10-NEXT: v_add_f32_e32 v1, v1, v17
-; GFX10-NEXT: v_add_f32_e32 v17, v26, v50
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
-; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
-; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
-; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
-; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
-; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49
-; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10
-; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
+; GFX10-NEXT: v_add_f32_e32 v17, v50, v22
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33
+; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1
+; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_add_f32_e32 v33, v34, v33
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
-; GFX10-NEXT: v_add_f32_e32 v14, v14, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_add_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_add_f32_e32 v20, v36, v25
; GFX10-NEXT: v_add_f32_e32 v3, v3, v19
-; GFX10-NEXT: v_add_f32_e32 v19, v28, v38
-; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1
-; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9
-; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10
-; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11
-; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12
-; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1
-; GFX10-NEXT: v_add_f32_e32 v51, v52, v51
+; GFX10-NEXT: v_add_f32_e32 v19, v38, v24
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14
+; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1
+; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX10-NEXT: v_add_f32_e32 v5, v5, v21
-; GFX10-NEXT: v_add_f32_e32 v21, v30, v34
-; GFX10-NEXT: v_add_f32_e32 v4, v4, v20
-; GFX10-NEXT: v_add_f32_e32 v20, v29, v36
-; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
-; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
-; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
-; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
-; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
-; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
-; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
-; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
-; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
-; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
-; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
-; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14
-; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35
-; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13
-; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7
-; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8
-; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10
-; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
+; GFX10-NEXT: v_add_f32_e32 v21, v51, v26
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35
+; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1
+; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13
+; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1
+; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1
+; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12
+; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39
+; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11
+; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1
+; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10
+; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34
+; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9
+; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1
+; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8
+; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29
+; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7
+; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1
+; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
-; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
-; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
-; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
-; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24
-; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4
-; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5
-; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6
-; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
-; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
-; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
-; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
-; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
-; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
-; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
-; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
-; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
-; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
-; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
-; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
-; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
+; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1
; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
-; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
-; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
-; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
-; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
-; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
-; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
-; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
-; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7
-; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
-; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
-; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
-; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13
-; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14
-; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16
-; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18
-; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20
-; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21
-; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
-; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
-; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
-; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
-; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19
+; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18
+; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302
+; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo
+; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302
+; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302
+; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302
+; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302
+; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
; GFX10-NEXT: v_add_f32_e32 v17, v31, v17
; GFX10-NEXT: v_add_f32_e32 v15, v15, v18
-; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
+; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1
+; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
-; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
-; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff
+; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302
+; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo
+; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302
+; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302
; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -14496,55 +14494,47 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v12, v12, v28
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_mul_f32_e32 v11, v11, v27
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_mul_f32_e32 v10, v10, v26
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_mul_f32_e32 v9, v9, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_mul_f32_e32 v8, v8, v24
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_mul_f32_e32 v7, v7, v23
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_mul_f32_e32 v6, v6, v22
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_mul_f32_e32 v5, v5, v21
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_mul_f32_e32 v11, v11, v27
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_mul_f32_e32 v4, v4, v20
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
@@ -14553,6 +14543,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_mul_f32_e32 v10, v10, v26
+; GCN-NEXT: v_mul_f32_e32 v9, v9, v25
+; GCN-NEXT: v_mul_f32_e32 v8, v8, v24
+; GCN-NEXT: v_mul_f32_e32 v7, v7, v23
+; GCN-NEXT: v_mul_f32_e32 v6, v6, v22
+; GCN-NEXT: v_mul_f32_e32 v5, v5, v21
+; GCN-NEXT: v_mul_f32_e32 v4, v4, v20
; GCN-NEXT: v_mul_f32_e32 v3, v3, v19
; GCN-NEXT: v_mul_f32_e32 v2, v2, v18
; GCN-NEXT: v_mul_f32_e32 v1, v1, v17
@@ -14572,7 +14570,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_mul_f32_e32 v15, v15, v16
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
@@ -14582,20 +14580,22 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-LABEL: v_fmul_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_mul_f32_e32 v11, v11, v27
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_mul_f32_e32 v6, v6, v22
-; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
@@ -14606,25 +14606,24 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v6, v6, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
@@ -14649,7 +14648,6 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_mul_f32_e32 v14, v14, v30
; GFX7-NEXT: v_mul_f32_e32 v13, v13, v29
; GFX7-NEXT: v_mul_f32_e32 v12, v12, v28
-; GFX7-NEXT: v_mul_f32_e32 v11, v11, v27
; GFX7-NEXT: v_mul_f32_e32 v10, v10, v26
; GFX7-NEXT: v_mul_f32_e32 v9, v9, v25
; GFX7-NEXT: v_mul_f32_e32 v8, v8, v24
@@ -14668,7 +14666,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_mul_f32_e32 v15, v15, v22
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
@@ -16126,10 +16124,10 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
@@ -16432,278 +16430,278 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24
-; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23
-; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
-; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
-; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX10-NEXT: v_mul_f32_e32 v39, v48, v39
-; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27
-; GFX10-NEXT: v_mul_f32_e32 v49, v50, v49
-; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
; GFX10-NEXT: v_mul_f32_e32 v12, v12, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22
+; GFX10-NEXT: v_mul_f32_e32 v39, v48, v39
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21
+; GFX10-NEXT: v_mul_f32_e32 v49, v50, v49
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5
+; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33
+; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24
+; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23
+; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16
+; GFX10-NEXT: v_mul_f32_e32 v27, v50, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9
+; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18
+; GFX10-NEXT: v_mul_f32_e32 v29, v38, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
-; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1
+; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17
+; GFX10-NEXT: v_mul_f32_e32 v28, v48, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1
; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16
-; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
+; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20
+; GFX10-NEXT: v_mul_f32_e32 v34, v34, v51
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_mul_f32_e32 v9, v9, v25
-; GFX10-NEXT: v_mul_f32_e32 v25, v54, v53
-; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24
-; GFX10-NEXT: v_mul_f32_e32 v24, v64, v55
-; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23
-; GFX10-NEXT: v_mul_f32_e32 v23, v66, v65
-; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22
-; GFX10-NEXT: v_mul_f32_e32 v22, v68, v67
-; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1
-; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
-; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
-; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
-; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19
+; GFX10-NEXT: v_mul_f32_e32 v30, v36, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3
; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_mul_f32_e32 v2, v2, v18
-; GFX10-NEXT: v_mul_f32_e32 v18, v27, v48
+; GFX10-NEXT: v_mul_f32_e32 v18, v48, v23
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17
-; GFX10-NEXT: v_mul_f32_e32 v17, v26, v50
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
-; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
-; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
-; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
-; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
-; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49
-; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10
-; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
+; GFX10-NEXT: v_mul_f32_e32 v17, v50, v22
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33
+; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1
+; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
-; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_mul_f32_e32 v20, v36, v25
; GFX10-NEXT: v_mul_f32_e32 v3, v3, v19
-; GFX10-NEXT: v_mul_f32_e32 v19, v28, v38
-; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1
-; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9
-; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10
-; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11
-; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12
-; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1
-; GFX10-NEXT: v_mul_f32_e32 v51, v52, v51
+; GFX10-NEXT: v_mul_f32_e32 v19, v38, v24
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14
+; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1
+; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX10-NEXT: v_mul_f32_e32 v5, v5, v21
-; GFX10-NEXT: v_mul_f32_e32 v21, v30, v34
-; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20
-; GFX10-NEXT: v_mul_f32_e32 v20, v29, v36
-; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
-; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
-; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
-; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
-; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
-; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
-; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
-; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
-; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
-; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
-; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
-; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14
-; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35
-; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13
-; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7
-; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8
-; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10
-; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
+; GFX10-NEXT: v_mul_f32_e32 v21, v51, v26
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35
+; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1
+; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13
+; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1
+; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1
+; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12
+; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39
+; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11
+; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1
+; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10
+; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34
+; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9
+; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1
+; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8
+; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29
+; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7
+; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1
+; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
-; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
-; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
-; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
-; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24
-; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4
-; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5
-; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6
-; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
-; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
-; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
-; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
-; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
-; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
-; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
-; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
-; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
-; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
-; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
-; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
-; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
+; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1
; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
-; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
-; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
-; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
-; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
-; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
-; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
-; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
-; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7
-; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
-; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
-; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
-; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13
-; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14
-; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16
-; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18
-; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20
-; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21
-; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
-; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
-; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
-; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
-; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19
+; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18
+; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302
+; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo
+; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302
+; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302
+; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302
+; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302
+; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
; GFX10-NEXT: v_mul_f32_e32 v17, v31, v17
; GFX10-NEXT: v_mul_f32_e32 v15, v15, v18
-; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
+; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1
+; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
-; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
-; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff
+; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302
+; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo
+; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302
+; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302
; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -18574,55 +18572,47 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_min_f32_e32 v12, v12, v28
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_min_f32_e32 v11, v11, v27
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_min_f32_e32 v10, v10, v26
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_min_f32_e32 v9, v9, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_min_f32_e32 v8, v8, v24
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_min_f32_e32 v7, v7, v23
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_min_f32_e32 v6, v6, v22
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_min_f32_e32 v5, v5, v21
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_min_f32_e32 v11, v11, v27
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_min_f32_e32 v4, v4, v20
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
@@ -18631,6 +18621,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_min_f32_e32 v10, v10, v26
+; GCN-NEXT: v_min_f32_e32 v9, v9, v25
+; GCN-NEXT: v_min_f32_e32 v8, v8, v24
+; GCN-NEXT: v_min_f32_e32 v7, v7, v23
+; GCN-NEXT: v_min_f32_e32 v6, v6, v22
+; GCN-NEXT: v_min_f32_e32 v5, v5, v21
+; GCN-NEXT: v_min_f32_e32 v4, v4, v20
; GCN-NEXT: v_min_f32_e32 v3, v3, v19
; GCN-NEXT: v_min_f32_e32 v2, v2, v18
; GCN-NEXT: v_min_f32_e32 v1, v1, v17
@@ -18650,7 +18648,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_min_f32_e32 v15, v15, v16
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
@@ -18660,20 +18658,22 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-LABEL: v_minnum_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_min_f32_e32 v11, v11, v27
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_min_f32_e32 v6, v6, v22
-; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
@@ -18684,25 +18684,24 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
@@ -18727,7 +18726,6 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_min_f32_e32 v14, v14, v30
; GFX7-NEXT: v_min_f32_e32 v13, v13, v29
; GFX7-NEXT: v_min_f32_e32 v12, v12, v28
-; GFX7-NEXT: v_min_f32_e32 v11, v11, v27
; GFX7-NEXT: v_min_f32_e32 v10, v10, v26
; GFX7-NEXT: v_min_f32_e32 v9, v9, v25
; GFX7-NEXT: v_min_f32_e32 v8, v8, v24
@@ -18746,7 +18744,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_min_f32_e32 v15, v15, v22
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
@@ -20204,10 +20202,10 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
@@ -20510,278 +20508,278 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24
-; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23
-; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
-; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
-; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX10-NEXT: v_min_f32_e32 v39, v48, v39
-; GFX10-NEXT: v_min_f32_e32 v11, v11, v27
-; GFX10-NEXT: v_min_f32_e32 v49, v50, v49
-; GFX10-NEXT: v_min_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_min_f32_e32 v37, v38, v37
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
; GFX10-NEXT: v_min_f32_e32 v12, v12, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22
+; GFX10-NEXT: v_min_f32_e32 v39, v48, v39
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_min_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21
+; GFX10-NEXT: v_min_f32_e32 v49, v50, v49
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5
+; GFX10-NEXT: v_min_f32_e32 v33, v34, v33
+; GFX10-NEXT: v_min_f32_e32 v14, v14, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24
+; GFX10-NEXT: v_min_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23
+; GFX10-NEXT: v_min_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16
+; GFX10-NEXT: v_min_f32_e32 v27, v50, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9
+; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT: v_min_f32_e32 v8, v8, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18
+; GFX10-NEXT: v_min_f32_e32 v29, v38, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
-; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1
+; GFX10-NEXT: v_min_f32_e32 v7, v7, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17
+; GFX10-NEXT: v_min_f32_e32 v28, v48, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1
; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16
-; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
+; GFX10-NEXT: v_min_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20
+; GFX10-NEXT: v_min_f32_e32 v34, v34, v51
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_min_f32_e32 v9, v9, v25
-; GFX10-NEXT: v_min_f32_e32 v25, v54, v53
-; GFX10-NEXT: v_min_f32_e32 v8, v8, v24
-; GFX10-NEXT: v_min_f32_e32 v24, v64, v55
-; GFX10-NEXT: v_min_f32_e32 v7, v7, v23
-; GFX10-NEXT: v_min_f32_e32 v23, v66, v65
-; GFX10-NEXT: v_min_f32_e32 v6, v6, v22
-; GFX10-NEXT: v_min_f32_e32 v22, v68, v67
-; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1
-; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
-; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
-; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_min_f32_e32 v35, v36, v35
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
-; GFX10-NEXT: v_min_f32_e32 v13, v13, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19
+; GFX10-NEXT: v_min_f32_e32 v30, v36, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3
; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_min_f32_e32 v2, v2, v18
-; GFX10-NEXT: v_min_f32_e32 v18, v27, v48
+; GFX10-NEXT: v_min_f32_e32 v18, v48, v23
; GFX10-NEXT: v_min_f32_e32 v1, v1, v17
-; GFX10-NEXT: v_min_f32_e32 v17, v26, v50
-; GFX10-NEXT: v_min_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
-; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
-; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
-; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
-; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
-; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49
-; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10
-; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
+; GFX10-NEXT: v_min_f32_e32 v17, v50, v22
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33
+; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1
+; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_min_f32_e32 v33, v34, v33
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
-; GFX10-NEXT: v_min_f32_e32 v14, v14, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_min_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_min_f32_e32 v20, v36, v25
; GFX10-NEXT: v_min_f32_e32 v3, v3, v19
-; GFX10-NEXT: v_min_f32_e32 v19, v28, v38
-; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1
-; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9
-; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10
-; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11
-; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12
-; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1
-; GFX10-NEXT: v_min_f32_e32 v51, v52, v51
+; GFX10-NEXT: v_min_f32_e32 v19, v38, v24
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14
+; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1
+; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX10-NEXT: v_min_f32_e32 v5, v5, v21
-; GFX10-NEXT: v_min_f32_e32 v21, v30, v34
-; GFX10-NEXT: v_min_f32_e32 v4, v4, v20
-; GFX10-NEXT: v_min_f32_e32 v20, v29, v36
-; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
-; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
-; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
-; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
-; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
-; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
-; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
-; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
-; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
-; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
-; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
-; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14
-; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35
-; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13
-; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7
-; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8
-; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10
-; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
+; GFX10-NEXT: v_min_f32_e32 v21, v51, v26
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35
+; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1
+; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13
+; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1
+; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1
+; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12
+; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39
+; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11
+; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1
+; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10
+; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34
+; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9
+; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1
+; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8
+; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29
+; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7
+; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1
+; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
-; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
-; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
-; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
-; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24
-; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4
-; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5
-; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6
-; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
-; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
-; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
-; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
-; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
-; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
-; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
-; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
-; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
-; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
-; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
-; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
-; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
+; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1
; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
-; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
-; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
-; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
-; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
-; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
-; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
-; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
-; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7
-; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
-; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
-; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
-; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13
-; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14
-; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16
-; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18
-; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20
-; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21
-; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
-; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
-; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
-; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
-; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19
+; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18
+; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302
+; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo
+; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302
+; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302
+; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302
+; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302
+; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
; GFX10-NEXT: v_min_f32_e32 v17, v31, v17
; GFX10-NEXT: v_min_f32_e32 v15, v15, v18
-; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
+; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1
+; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
-; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
-; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff
+; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302
+; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo
+; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302
+; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302
; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -22193,55 +22191,47 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_max_f32_e32 v12, v12, v28
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_max_f32_e32 v11, v11, v27
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_max_f32_e32 v10, v10, v26
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_max_f32_e32 v9, v9, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_max_f32_e32 v8, v8, v24
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_max_f32_e32 v7, v7, v23
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_max_f32_e32 v6, v6, v22
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_max_f32_e32 v5, v5, v21
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_max_f32_e32 v11, v11, v27
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_max_f32_e32 v4, v4, v20
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
@@ -22250,6 +22240,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_max_f32_e32 v10, v10, v26
+; GCN-NEXT: v_max_f32_e32 v9, v9, v25
+; GCN-NEXT: v_max_f32_e32 v8, v8, v24
+; GCN-NEXT: v_max_f32_e32 v7, v7, v23
+; GCN-NEXT: v_max_f32_e32 v6, v6, v22
+; GCN-NEXT: v_max_f32_e32 v5, v5, v21
+; GCN-NEXT: v_max_f32_e32 v4, v4, v20
; GCN-NEXT: v_max_f32_e32 v3, v3, v19
; GCN-NEXT: v_max_f32_e32 v2, v2, v18
; GCN-NEXT: v_max_f32_e32 v1, v1, v17
@@ -22269,7 +22267,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_max_f32_e32 v15, v15, v16
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
@@ -22279,20 +22277,22 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-LABEL: v_maxnum_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_max_f32_e32 v6, v6, v22
-; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
@@ -22303,25 +22303,24 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
@@ -22346,7 +22345,6 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_max_f32_e32 v14, v14, v30
; GFX7-NEXT: v_max_f32_e32 v13, v13, v29
; GFX7-NEXT: v_max_f32_e32 v12, v12, v28
-; GFX7-NEXT: v_max_f32_e32 v11, v11, v27
; GFX7-NEXT: v_max_f32_e32 v10, v10, v26
; GFX7-NEXT: v_max_f32_e32 v9, v9, v25
; GFX7-NEXT: v_max_f32_e32 v8, v8, v24
@@ -22365,7 +22363,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_max_f32_e32 v15, v15, v22
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
@@ -23823,10 +23821,10 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
@@ -24129,278 +24127,278 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24
-; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23
-; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
-; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
-; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX10-NEXT: v_max_f32_e32 v39, v48, v39
-; GFX10-NEXT: v_max_f32_e32 v11, v11, v27
-; GFX10-NEXT: v_max_f32_e32 v49, v50, v49
-; GFX10-NEXT: v_max_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_max_f32_e32 v37, v38, v37
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
; GFX10-NEXT: v_max_f32_e32 v12, v12, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22
+; GFX10-NEXT: v_max_f32_e32 v39, v48, v39
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21
+; GFX10-NEXT: v_max_f32_e32 v49, v50, v49
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5
+; GFX10-NEXT: v_max_f32_e32 v33, v34, v33
+; GFX10-NEXT: v_max_f32_e32 v14, v14, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24
+; GFX10-NEXT: v_max_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23
+; GFX10-NEXT: v_max_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16
+; GFX10-NEXT: v_max_f32_e32 v27, v50, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9
+; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18
+; GFX10-NEXT: v_max_f32_e32 v29, v38, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
-; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17
+; GFX10-NEXT: v_max_f32_e32 v28, v48, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1
; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16
-; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20
+; GFX10-NEXT: v_max_f32_e32 v34, v34, v51
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_max_f32_e32 v9, v9, v25
-; GFX10-NEXT: v_max_f32_e32 v25, v54, v53
-; GFX10-NEXT: v_max_f32_e32 v8, v8, v24
-; GFX10-NEXT: v_max_f32_e32 v24, v64, v55
-; GFX10-NEXT: v_max_f32_e32 v7, v7, v23
-; GFX10-NEXT: v_max_f32_e32 v23, v66, v65
-; GFX10-NEXT: v_max_f32_e32 v6, v6, v22
-; GFX10-NEXT: v_max_f32_e32 v22, v68, v67
-; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1
-; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
-; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
-; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_max_f32_e32 v35, v36, v35
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
-; GFX10-NEXT: v_max_f32_e32 v13, v13, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19
+; GFX10-NEXT: v_max_f32_e32 v30, v36, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3
; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_max_f32_e32 v2, v2, v18
-; GFX10-NEXT: v_max_f32_e32 v18, v27, v48
+; GFX10-NEXT: v_max_f32_e32 v18, v48, v23
; GFX10-NEXT: v_max_f32_e32 v1, v1, v17
-; GFX10-NEXT: v_max_f32_e32 v17, v26, v50
-; GFX10-NEXT: v_max_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
-; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
-; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
-; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
-; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
-; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49
-; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10
-; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
+; GFX10-NEXT: v_max_f32_e32 v17, v50, v22
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33
+; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1
+; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_max_f32_e32 v33, v34, v33
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
-; GFX10-NEXT: v_max_f32_e32 v14, v14, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_max_f32_e32 v20, v36, v25
; GFX10-NEXT: v_max_f32_e32 v3, v3, v19
-; GFX10-NEXT: v_max_f32_e32 v19, v28, v38
-; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1
-; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9
-; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10
-; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11
-; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12
-; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1
-; GFX10-NEXT: v_max_f32_e32 v51, v52, v51
+; GFX10-NEXT: v_max_f32_e32 v19, v38, v24
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14
+; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1
+; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX10-NEXT: v_max_f32_e32 v5, v5, v21
-; GFX10-NEXT: v_max_f32_e32 v21, v30, v34
-; GFX10-NEXT: v_max_f32_e32 v4, v4, v20
-; GFX10-NEXT: v_max_f32_e32 v20, v29, v36
-; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
-; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
-; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
-; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
-; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
-; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
-; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
-; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
-; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
-; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
-; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
-; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14
-; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35
-; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13
-; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7
-; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8
-; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10
-; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
+; GFX10-NEXT: v_max_f32_e32 v21, v51, v26
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35
+; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1
+; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13
+; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1
+; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1
+; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12
+; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39
+; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11
+; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1
+; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10
+; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34
+; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9
+; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1
+; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8
+; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29
+; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7
+; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1
+; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
-; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
-; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
-; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
-; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24
-; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4
-; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5
-; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6
-; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
-; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
-; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
-; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
-; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
-; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
-; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
-; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
-; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
-; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
-; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
-; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
-; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
+; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1
; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
-; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
-; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
-; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
-; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
-; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
-; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
-; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
-; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7
-; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
-; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
-; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
-; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13
-; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14
-; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16
-; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18
-; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20
-; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21
-; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
-; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
-; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
-; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
-; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19
+; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18
+; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302
+; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo
+; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302
+; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302
+; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302
+; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302
+; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
; GFX10-NEXT: v_max_f32_e32 v17, v31, v17
; GFX10-NEXT: v_max_f32_e32 v15, v15, v18
-; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
+; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1
+; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
-; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
-; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff
+; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302
+; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo
+; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302
+; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302
; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -35657,81 +35655,81 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GCN-LABEL: v_select_v16bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v0, v0, v1, 16
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v17
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v19
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_alignbit_b32 v2, v2, v17, 16
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v21
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v23
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v30
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v29
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16
-; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_alignbit_b32 v4, v4, v17, 16
+; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32
-; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT: v_alignbit_b32 v17, v17, v18, 16
+; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16
+; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; GCN-NEXT: v_alignbit_b32 v8, v20, v21, 16
+; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; GCN-NEXT: v_alignbit_b32 v10, v22, v23, 16
+; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16
+; GCN-NEXT: v_alignbit_b32 v12, v24, v25, 16
; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; GCN-NEXT: v_alignbit_b32 v14, v19, v20, 16
+; GCN-NEXT: v_alignbit_b32 v14, v26, v27, 16
; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v6
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -35764,67 +35762,67 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v18
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v20
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_alignbit_b32 v2, v2, v17, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_alignbit_b32 v4, v4, v17, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4
; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v22
+; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v22
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v17, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v21
; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v24
; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_alignbit_b32 v6, v6, v17, 16
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_alignbit_b32 v18, v18, v19, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v23
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v23
; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v26
-; GFX7-NEXT: v_alignbit_b32 v8, v8, v17, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v25
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_alignbit_b32 v10, v10, v17, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v28
-; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4
-; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v27
-; GFX7-NEXT: v_alignbit_b32 v17, v17, v18, 16
-; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_alignbit_b32 v8, v8, v19, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v25
+; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v28
; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v30
; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_alignbit_b32 v10, v10, v19, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v27
+; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v30
+; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16
+; GFX7-NEXT: v_alignbit_b32 v12, v12, v19, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v29
-; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_alignbit_b32 v14, v14, v19, 16
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
@@ -35833,21 +35831,21 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v18
-; GFX7-NEXT: v_alignbit_b32 v12, v12, v16, 16
-; GFX7-NEXT: v_cndmask_b32_e32 v15, v12, v15, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_alignbit_b32 v6, v16, v6, 16
+; GFX7-NEXT: v_cndmask_b32_e32 v15, v6, v15, vcc
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -37134,30 +37132,30 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
; GCN-LABEL: v_vselect_v8bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v7, 1, v7
+; GCN-NEXT: v_and_b32_e32 v6, 1, v6
+; GCN-NEXT: v_and_b32_e32 v5, 1, v5
+; GCN-NEXT: v_and_b32_e32 v4, 1, v4
+; GCN-NEXT: v_and_b32_e32 v3, 1, v3
+; GCN-NEXT: v_and_b32_e32 v2, 1, v2
+; GCN-NEXT: v_and_b32_e32 v1, 1, v1
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_and_b32_e32 v2, 1, v2
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_and_b32_e32 v3, 1, v3
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_and_b32_e32 v5, 1, v5
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_and_b32_e32 v6, 1, v6
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_and_b32_e32 v7, 1, v7
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GCN-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
@@ -37188,45 +37186,45 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v7, 1, v7
-; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX7-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
; GFX7-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v22
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX7-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v15, v14, vcc
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_cndmask_b32_e32 v6, v22, v14, vcc
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v21
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
-; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX7-NEXT: v_cndmask_b32_e32 v5, v14, v13, vcc
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_cndmask_b32_e32 v5, v21, v13, vcc
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v20
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX7-NEXT: v_cndmask_b32_e32 v4, v13, v12, vcc
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v20, v12, vcc
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v19
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v12, v11, vcc
; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v19, v11, vcc
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v18
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v18, v10, vcc
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v17
+; GFX7-NEXT: v_cndmask_b32_e32 v2, v13, v10, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v16
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v12, v9, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v11, v8, vcc
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
@@ -37495,16 +37493,16 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16
; GCN-NEXT: v_and_b32_e32 v1, 1, v10
; GCN-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v1
-; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v17
-; GCN-NEXT: v_and_b32_e32 v3, 1, v11
-; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v3
-; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v18
-; GCN-NEXT: v_and_b32_e32 v5, 1, v12
-; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v5
+; GCN-NEXT: v_and_b32_e32 v2, 1, v11
+; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v2
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v18
+; GCN-NEXT: v_and_b32_e32 v3, 1, v12
+; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v3
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v19
; GCN-NEXT: v_and_b32_e32 v7, 1, v13
; GCN-NEXT: v_and_b32_e32 v8, 1, v14
; GCN-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v7
@@ -37571,22 +37569,22 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[12:13]
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_cndmask_b32_e64 v19, v20, v19, s[10:11]
-; GCN-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[8:9]
-; GCN-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[8:9]
+; GCN-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v17
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16
@@ -37612,151 +37610,136 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX7-LABEL: v_vselect_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX7-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v8
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v7
+; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64
+; GFX7-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v15
+; GFX7-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v14
+; GFX7-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v13
+; GFX7-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v12
+; GFX7-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v11
+; GFX7-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX7-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX7-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v9
+; GFX7-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v2
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v3
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v4
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v5
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v6
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v7
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v8
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v9
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v10
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v11
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
-; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32
-; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64
-; GFX7-NEXT: v_and_b32_e32 v2, 1, v12
-; GFX7-NEXT: v_writelane_b32 v31, s30, 0
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v2
-; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60
-; GFX7-NEXT: v_and_b32_e32 v3, 1, v13
-; GFX7-NEXT: v_writelane_b32 v31, s31, 1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v3
-; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56
-; GFX7-NEXT: v_and_b32_e32 v4, 1, v14
-; GFX7-NEXT: v_writelane_b32 v31, s34, 2
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v4
-; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52
-; GFX7-NEXT: v_and_b32_e32 v5, 1, v15
-; GFX7-NEXT: v_writelane_b32 v31, s35, 3
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v5
-; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v15, v1, v0, s[34:35]
-; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_cndmask_b32_e64 v14, v2, v1, s[30:31]
-; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v29
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_cndmask_b32_e64 v13, v3, v2, s[28:29]
-; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v28
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_cndmask_b32_e64 v12, v4, v3, s[26:27]
-; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:32
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v27
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_cndmask_b32_e64 v11, v5, v4, s[24:25]
-; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v26
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_cndmask_b32_e64 v15, v8, v7, s[12:13]
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v30
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: v_readlane_b32 s35, v31, 3
-; GFX7-NEXT: v_readlane_b32 s34, v31, 2
-; GFX7-NEXT: v_readlane_b32 s31, v31, 1
-; GFX7-NEXT: v_readlane_b32 s30, v31, 0
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v10, v0, v5, s[22:23]
-; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v25
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v9, v1, v5, s[20:21]
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v24
-; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_cndmask_b32_e64 v8, v2, v5, s[18:19]
-; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v23
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_cndmask_b32_e64 v7, v3, v5, s[16:17]
-; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v22
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_cndmask_b32_e64 v6, v4, v5, s[14:15]
-; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_cndmask_b32_e64 v14, v8, v7, s[10:11]
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v29
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_cndmask_b32_e64 v13, v8, v7, s[8:9]
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v28
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_cndmask_b32_e64 v12, v8, v7, s[6:7]
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_cndmask_b32_e64 v11, v8, v7, s[4:5]
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:44
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v26
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_cndmask_b32_e32 v10, v8, v7, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v22
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:28
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v25
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v21
+; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_cndmask_b32_e64 v9, v8, v7, s[18:19]
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v24
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[12:13]
-; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_cndmask_b32_e64 v8, v8, v7, s[16:17]
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v23
+; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v17, s[4:5]
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc
+; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[14:15]
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc
+; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v18, s[6:7]
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v19, s[8:9]
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v20
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v20, v0, v20, s[10:11]
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v16, vcc
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v18, v16, vcc
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
-; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v19
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_vselect_v16bf16:
@@ -37787,53 +37770,51 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v10
; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
-; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v11
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v12
; GFX8-NEXT: v_writelane_b32 v31, s30, 0
-; GFX8-NEXT: v_and_b32_e32 v2, 1, v12
-; GFX8-NEXT: v_and_b32_e32 v3, 1, v13
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v13
; GFX8-NEXT: v_writelane_b32 v31, s31, 1
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v22
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v30
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v14
; GFX8-NEXT: v_writelane_b32 v31, s34, 2
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v11
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v14
-; GFX8-NEXT: v_and_b32_e32 v5, 1, v15
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v3, v2, s[28:29]
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v20
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v28
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v15
; GFX8-NEXT: v_writelane_b32 v31, s35, 3
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v1
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v22
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v30
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v0, s[28:29]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v21
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v29
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v1, v0, s[24:25]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v20
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v28
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, v0, s[20:21]
+; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v23
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v4
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v10, v3, v2, s[20:21]
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v21
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v29
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[24:25]
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v19
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v27
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v4, s[16:17]
; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v24
; GFX8-NEXT: v_cndmask_b32_e64 v7, v30, v22, s[26:27]
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v27, v19, s[14:15]
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_cndmask_b32_e64 v8, v29, v21, s[22:23]
-; GFX8-NEXT: v_cndmask_b32_e64 v11, v28, v20, s[18:19]
-; GFX8-NEXT: v_cndmask_b32_e64 v9, v26, v18, s[10:11]
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v28, v20, s[18:19]
+; GFX8-NEXT: v_cndmask_b32_e64 v12, v27, v19, s[14:15]
+; GFX8-NEXT: v_cndmask_b32_e64 v13, v26, v18, s[10:11]
; GFX8-NEXT: v_cndmask_b32_e64 v14, v25, v17, s[6:7]
-; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v12, v0, v23, s[30:31]
-; GFX8-NEXT: v_cndmask_b32_e64 v13, v2, v1, s[34:35]
+; GFX8-NEXT: v_cndmask_b32_e64 v10, v0, v23, s[30:31]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v11, v0, v1, s[34:35]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v19
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v27
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v0, s[16:17]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v18
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v26
; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[12:13]
@@ -37846,11 +37827,13 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v11
; GFX8-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v7, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_readlane_b32 s35, v31, 3
; GFX8-NEXT: v_readlane_b32 s34, v31, 2
; GFX8-NEXT: v_readlane_b32 s31, v31, 1
@@ -37864,81 +37847,81 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX9-LABEL: v_vselect_v16bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v4
-; GFX9-NEXT: v_and_b32_e32 v4, 1, v14
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v4
-; GFX9-NEXT: v_and_b32_e32 v4, 1, v15
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v4
-; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32
; GFX9-NEXT: v_and_b32_e32 v12, 1, v12
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
-; GFX9-NEXT: v_and_b32_e32 v12, 1, v13
+; GFX9-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
; GFX9-NEXT: v_and_b32_e32 v10, 1, v10
-; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX9-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v22, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
; GFX9-NEXT: v_and_b32_e32 v10, 1, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v29, v21, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v29
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v22, v21, vcc
+; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32
; GFX9-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v6
-; GFX9-NEXT: v_and_b32_e32 v6, 1, v7
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v5
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v22
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v30
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v10
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v8
-; GFX9-NEXT: v_and_b32_e32 v8, 1, v9
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[4:5]
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v21
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v29
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[8:9]
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v20
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v28
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[12:13]
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v19
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v27
+; GFX9-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v20
+; GFX9-NEXT: v_cndmask_b32_e32 v20, v28, v20, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v28
+; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
+; GFX9-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v27
+; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX9-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v22, v9, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v26
+; GFX9-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX9-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v27, v6, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v13, v11, s[16:17]
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v23
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v30, v22, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v23
; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v15, v26, v18, s[18:19]
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v29, v21, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e64 v10, v28, v20, s[10:11]
-; GFX9-NEXT: v_cndmask_b32_e64 v12, v27, v19, s[14:15]
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; GFX9-NEXT: v_perm_b32 v5, v5, v8, s4
-; GFX9-NEXT: v_perm_b32 v6, v7, v6, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v14, v4, v23, s[20:21]
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v13, v4, v13, s[22:23]
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v26
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[24:25]
-; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v25
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v17, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v21, v23, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v21
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v15, v6, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v15, v6, vcc
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
-; GFX9-NEXT: v_perm_b32 v2, v4, v15, s4
-; GFX9-NEXT: v_perm_b32 v3, v11, v12, s4
-; GFX9-NEXT: v_perm_b32 v4, v9, v10, s4
-; GFX9-NEXT: v_perm_b32 v7, v13, v14, s4
+; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4
+; GFX9-NEXT: v_perm_b32 v3, v9, v19, s4
+; GFX9-NEXT: v_perm_b32 v4, v8, v20, s4
+; GFX9-NEXT: v_perm_b32 v5, v10, v11, s4
+; GFX9-NEXT: v_perm_b32 v6, v13, v12, s4
+; GFX9-NEXT: v_perm_b32 v7, v7, v14, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_vselect_v16bf16:
@@ -37955,13 +37938,13 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v21
; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v29
-; GFX10-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v30, v22, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v20
; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v28
-; GFX10-NEXT: v_cndmask_b32_e32 v13, v34, v33, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v34, v33, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
@@ -37970,13 +37953,13 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX10-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v25
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v25
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
; GFX10-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
-; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v16
-; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v24
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v24
; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v18
; GFX10-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo
@@ -37995,11 +37978,11 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX10-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v52, v51, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v12, v51, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v54, v53, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v13, v30, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc_lo
@@ -38012,12 +37995,12 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX10-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v31
-; GFX10-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v31, v23, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v15, v3, v32, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v3, v32, vcc_lo
; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
-; GFX10-NEXT: v_perm_b32 v6, v13, v12, 0x5040100
-; GFX10-NEXT: v_perm_b32 v7, v15, v14, 0x5040100
+; GFX10-NEXT: v_perm_b32 v6, v33, v22, 0x5040100
+; GFX10-NEXT: v_perm_b32 v7, v13, v12, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_vselect_v16bf16:
@@ -39408,219 +39391,206 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-LABEL: v_vselect_v32bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_or_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: v_and_b32_e32 v29, 1, v29
+; GFX10-NEXT: s_clause 0xa
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
+; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104
+; GFX10-NEXT: buffer_load_ushort v35, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:128
+; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:64
+; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96
+; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:108
+; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44
+; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112
; GFX10-NEXT: v_and_b32_e32 v30, 1, v30
+; GFX10-NEXT: v_and_b32_e32 v18, 1, v18
+; GFX10-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX10-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX10-NEXT: v_and_b32_e32 v19, 1, v19
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30
+; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v18
; GFX10-NEXT: v_and_b32_e32 v28, 1, v28
+; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v13
+; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v19
; GFX10-NEXT: v_and_b32_e32 v26, 1, v26
; GFX10-NEXT: v_and_b32_e32 v24, 1, v24
; GFX10-NEXT: v_and_b32_e32 v22, 1, v22
; GFX10-NEXT: v_and_b32_e32 v20, 1, v20
-; GFX10-NEXT: v_and_b32_e32 v18, 1, v18
+; GFX10-NEXT: v_and_b32_e32 v21, 1, v21
; GFX10-NEXT: v_and_b32_e32 v16, 1, v16
; GFX10-NEXT: v_and_b32_e32 v14, 1, v14
-; GFX10-NEXT: v_and_b32_e32 v12, 1, v12
-; GFX10-NEXT: s_clause 0x14
-; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
-; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
-; GFX10-NEXT: buffer_load_ushort v33, off, s[0:3], s32
-; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128
-; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64
-; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48
-; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:116
-; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52
-; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120
-; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56
-; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32
-; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100
-; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36
-; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104
-; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40
-; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:108
-; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44
-; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:112
-; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:72
-; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:76
-; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:80
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92
-; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:28
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v30
-; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96
-; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v28
-; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88
-; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v26
-; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 1, v24
-; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84
-; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 1, v22
-; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:20
-; GFX10-NEXT: v_cmp_eq_u32_e64 s9, 1, v20
-; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
-; GFX10-NEXT: v_cmp_eq_u32_e64 s10, 1, v18
-; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
-; GFX10-NEXT: v_cmp_eq_u32_e64 s11, 1, v16
-; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8
-; GFX10-NEXT: v_cmp_eq_u32_e64 s12, 1, v14
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68
-; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:24
-; GFX10-NEXT: v_cmp_eq_u32_e64 s13, 1, v12
-; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX10-NEXT: v_and_b32_e32 v17, 1, v17
+; GFX10-NEXT: v_and_b32_e32 v15, 1, v15
; GFX10-NEXT: v_and_b32_e32 v10, 1, v10
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
-; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
-; GFX10-NEXT: v_and_b32_e32 v11, 1, v11
-; GFX10-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX10-NEXT: v_and_b32_e32 v15, 1, v15
-; GFX10-NEXT: v_and_b32_e32 v17, 1, v17
-; GFX10-NEXT: v_and_b32_e32 v19, 1, v19
-; GFX10-NEXT: v_and_b32_e32 v21, 1, v21
-; GFX10-NEXT: v_and_b32_e32 v23, 1, v23
-; GFX10-NEXT: v_and_b32_e32 v25, 1, v25
-; GFX10-NEXT: v_and_b32_e32 v27, 1, v27
-; GFX10-NEXT: v_cmp_eq_u32_e64 s14, 1, v10
-; GFX10-NEXT: v_cmp_eq_u32_e64 s15, 1, v8
-; GFX10-NEXT: v_cmp_eq_u32_e64 s16, 1, v6
-; GFX10-NEXT: v_cmp_eq_u32_e64 s17, 1, v4
-; GFX10-NEXT: v_cmp_eq_u32_e64 s18, 1, v2
-; GFX10-NEXT: v_cmp_eq_u32_e64 s19, 1, v0
-; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: v_cmp_eq_u32_e64 s20, 1, v27
-; GFX10-NEXT: v_cmp_eq_u32_e64 s21, 1, v25
-; GFX10-NEXT: v_cmp_eq_u32_e64 s22, 1, v23
-; GFX10-NEXT: v_cmp_eq_u32_e64 s23, 1, v21
-; GFX10-NEXT: v_cmp_eq_u32_e64 s24, 1, v19
-; GFX10-NEXT: v_cmp_eq_u32_e64 s25, 1, v17
-; GFX10-NEXT: v_cmp_eq_u32_e64 s26, 1, v15
-; GFX10-NEXT: v_cmp_eq_u32_e64 s27, 1, v13
-; GFX10-NEXT: v_cmp_eq_u32_e64 s28, 1, v11
-; GFX10-NEXT: v_cmp_eq_u32_e64 s29, 1, v7
-; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_hi, 1, v3
-; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v1
-; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v5
-; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v9
-; GFX10-NEXT: s_waitcnt vmcnt(32)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v31
-; GFX10-NEXT: s_waitcnt vmcnt(31)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v32
-; GFX10-NEXT: s_waitcnt vmcnt(30)
-; GFX10-NEXT: v_and_b32_e32 v2, 1, v33
-; GFX10-NEXT: s_waitcnt vmcnt(29)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v34
-; GFX10-NEXT: s_waitcnt vmcnt(28)
-; GFX10-NEXT: v_cndmask_b32_e64 v15, v34, v35, s4
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v35
-; GFX10-NEXT: v_cndmask_b32_e64 v17, v32, v31, s5
-; GFX10-NEXT: s_waitcnt vmcnt(25)
-; GFX10-NEXT: v_cndmask_b32_e64 v19, v37, v38, s7
-; GFX10-NEXT: s_waitcnt vmcnt(24)
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v39
-; GFX10-NEXT: s_waitcnt vmcnt(23)
-; GFX10-NEXT: v_cndmask_b32_e64 v13, v39, v48, s6
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v48
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v38
-; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v37
-; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v36
-; GFX10-NEXT: s_waitcnt vmcnt(18)
-; GFX10-NEXT: v_cndmask_b32_e64 v27, v52, v53, s10
-; GFX10-NEXT: s_waitcnt vmcnt(17)
-; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v54
-; GFX10-NEXT: s_waitcnt vmcnt(16)
-; GFX10-NEXT: v_cndmask_b32_e64 v21, v54, v55, s9
-; GFX10-NEXT: s_waitcnt vmcnt(15)
-; GFX10-NEXT: v_cndmask_b32_e64 v11, v64, v36, s8
-; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v64
-; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v55
-; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v53
-; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v52
-; GFX10-NEXT: v_cndmask_b32_e64 v33, v50, v51, s11
-; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v51
-; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v50
+; GFX10-NEXT: s_waitcnt vmcnt(10)
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v31
; GFX10-NEXT: s_waitcnt vmcnt(9)
-; GFX10-NEXT: v_cndmask_b32_e64 v36, v30, v49, s12
-; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v49
-; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GFX10-NEXT: v_cndmask_b32_e64 v38, v29, v68, s13
-; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v68
-; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v32
+; GFX10-NEXT: s_waitcnt vmcnt(8)
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v33
+; GFX10-NEXT: s_waitcnt vmcnt(7)
+; GFX10-NEXT: v_cndmask_b32_e64 v18, v34, v33, s6
; GFX10-NEXT: s_waitcnt vmcnt(6)
-; GFX10-NEXT: v_cndmask_b32_e64 v49, v24, v22, s15
-; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GFX10-NEXT: s_waitcnt vmcnt(5)
-; GFX10-NEXT: v_cndmask_b32_e64 v50, v67, v20, s16
-; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v67
+; GFX10-NEXT: v_and_b32_e32 v35, 1, v35
+; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v12
; GFX10-NEXT: s_waitcnt vmcnt(4)
-; GFX10-NEXT: v_cndmask_b32_e64 v52, v66, v18, s17
-; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_cndmask_b32_e64 v48, v28, v26, s14
-; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v66
-; GFX10-NEXT: v_cndmask_b32_e64 v54, v65, v16, s18
-; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX10-NEXT: v_lshrrev_b32_e32 v55, 16, v65
+; GFX10-NEXT: v_cndmask_b32_e32 v54, v36, v37, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v35
+; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v34
+; GFX10-NEXT: v_cndmask_b32_e64 v12, v32, v31, s6
+; GFX10-NEXT: s_clause 0x6
+; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
+; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:76
+; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12
+; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:80
+; GFX10-NEXT: v_cndmask_b32_e64 v30, v50, v30, s4
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc_lo
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:124
+; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28
+; GFX10-NEXT: v_and_b32_e32 v28, 1, v29
+; GFX10-NEXT: v_cndmask_b32_e64 v13, v51, v13, s5
+; GFX10-NEXT: s_waitcnt vmcnt(3)
+; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v52
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e64 v64, v14, v12, s19
-; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v65, v1, v0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v66, v6, v5, s20
-; GFX10-NEXT: v_cndmask_b32_e64 v67, v8, v7, s21
-; GFX10-NEXT: v_cndmask_b32_e64 v68, v10, v9, s22
-; GFX10-NEXT: v_cndmask_b32_e64 v10, v25, v23, s23
-; GFX10-NEXT: v_cndmask_b32_e64 v9, v32, v31, s24
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v35, v34, s25
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v30, v37, s26
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v29, v39, s27
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v28, v26, s28
-; GFX10-NEXT: v_cndmask_b32_e64 v20, v51, v20, s29
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s30
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v55, v16, vcc_hi
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s31
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s34
-; GFX10-NEXT: v_cndmask_b32_e64 v16, v4, v3, s4
-; GFX10-NEXT: v_perm_b32 v0, v0, v64, 0x5040100
-; GFX10-NEXT: v_perm_b32 v1, v1, v54, 0x5040100
-; GFX10-NEXT: v_perm_b32 v2, v2, v52, 0x5040100
-; GFX10-NEXT: v_perm_b32 v3, v20, v50, 0x5040100
-; GFX10-NEXT: v_perm_b32 v4, v12, v49, 0x5040100
-; GFX10-NEXT: v_perm_b32 v5, v5, v48, 0x5040100
-; GFX10-NEXT: v_perm_b32 v6, v6, v38, 0x5040100
-; GFX10-NEXT: v_perm_b32 v7, v7, v36, 0x5040100
-; GFX10-NEXT: v_perm_b32 v8, v8, v33, 0x5040100
-; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x5040100
-; GFX10-NEXT: v_perm_b32 v10, v10, v21, 0x5040100
-; GFX10-NEXT: v_perm_b32 v11, v68, v11, 0x5040100
-; GFX10-NEXT: v_perm_b32 v12, v67, v19, 0x5040100
-; GFX10-NEXT: v_perm_b32 v13, v66, v13, 0x5040100
-; GFX10-NEXT: v_perm_b32 v14, v65, v17, 0x5040100
-; GFX10-NEXT: v_perm_b32 v15, v16, v15, 0x5040100
-; GFX10-NEXT: v_readlane_b32 s34, v40, 2
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
-; GFX10-NEXT: v_readlane_b32 s30, v40, 0
-; GFX10-NEXT: s_or_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v36, v37, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v36, v37, vcc_lo
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120
+; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26
+; GFX10-NEXT: v_and_b32_e32 v26, 1, v27
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26
+; GFX10-NEXT: v_cndmask_b32_e32 v26, v36, v37, vcc_lo
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:116
+; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24
+; GFX10-NEXT: v_and_b32_e32 v24, 1, v25
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v36, v37, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v36, v37, vcc_lo
+; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22
+; GFX10-NEXT: v_and_b32_e32 v22, 1, v23
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v49
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v49, v36, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22
+; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v53
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v37, v36, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v48
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v39
+; GFX10-NEXT: v_cndmask_b32_e32 v20, v39, v48, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
+; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v37, v36, vcc_lo
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:100
+; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v36, v37, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v38, v39, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v39
+; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v38
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v36, v37, vcc_lo
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88
+; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v38, v39, vcc_lo
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84
+; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
+; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v36, v37, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v38, v39, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v39
+; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v38
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v53, v48, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v48
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v34, v52, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v32, v33, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v19, v31, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v36, v37, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v49, v48, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v32, v33, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v19, v31, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v34, v50, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
+; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
+; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
+; GFX10-NEXT: v_perm_b32 v6, v30, v12, 0x5040100
+; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v38, v39, vcc_lo
+; GFX10-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
+; GFX10-NEXT: v_perm_b32 v7, v15, v14, 0x5040100
+; GFX10-NEXT: v_perm_b32 v10, v21, v20, 0x5040100
+; GFX10-NEXT: v_perm_b32 v11, v22, v23, 0x5040100
+; GFX10-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
+; GFX10-NEXT: v_perm_b32 v8, v17, v16, 0x5040100
+; GFX10-NEXT: v_perm_b32 v9, v13, v18, 0x5040100
+; GFX10-NEXT: v_perm_b32 v12, v24, v25, 0x5040100
+; GFX10-NEXT: v_perm_b32 v13, v26, v27, 0x5040100
+; GFX10-NEXT: v_perm_b32 v14, v28, v29, 0x5040100
+; GFX10-NEXT: v_perm_b32 v15, v35, v54, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_vselect_v32bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
index dd9c9a3699b4ff..05c2e0077f4aea 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
@@ -4,13 +4,13 @@
define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 {
; CHECK-LABEL: spill:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_load_dword s44, s[8:9], 0x2
+; CHECK-NEXT: s_load_dword s27, s[8:9], 0x2
; CHECK-NEXT: s_mov_b64 s[98:99], s[2:3]
; CHECK-NEXT: s_mov_b64 s[96:97], s[0:1]
; CHECK-NEXT: s_add_u32 s96, s96, s15
; CHECK-NEXT: s_addc_u32 s97, s97, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_cmp_eq_u32 s44, 0
+; CHECK-NEXT: s_cmp_eq_u32 s27, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: ;;#ASMEND
@@ -971,10 +971,10 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
; CHECK-NEXT: v_writelane_b32 v1, s98, 3
; CHECK-NEXT: v_writelane_b32 v0, s92, 61
; CHECK-NEXT: v_writelane_b32 v1, s99, 4
-; CHECK-NEXT: s_mov_b32 s49, s12
+; CHECK-NEXT: s_mov_b32 s31, s12
; CHECK-NEXT: v_writelane_b32 v0, s93, 62
; CHECK-NEXT: v_writelane_b32 v1, s100, 5
-; CHECK-NEXT: s_cmp_eq_u32 s49, 0
+; CHECK-NEXT: s_cmp_eq_u32 s31, 0
; CHECK-NEXT: v_writelane_b32 v0, s94, 63
; CHECK-NEXT: v_writelane_b32 v1, s101, 6
; CHECK-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 0009a84765639c..56ecfa298a348f 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -2487,10 +2487,10 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
; SI-NEXT: v_or_b32_e32 v1, v31, v1
; SI-NEXT: v_or_b32_e32 v5, v27, v5
; SI-NEXT: v_or_b32_e32 v9, v23, v9
+; SI-NEXT: v_or_b32_e32 v13, v19, v13
; SI-NEXT: v_and_b32_e32 v17, 3, v28
; SI-NEXT: v_and_b32_e32 v18, 3, v24
-; SI-NEXT: v_and_b32_e32 v20, 3, v20
-; SI-NEXT: v_or_b32_e32 v13, v19, v13
+; SI-NEXT: v_and_b32_e32 v19, 3, v20
; SI-NEXT: v_and_b32_e32 v16, 3, v16
; SI-NEXT: v_or_b32_e32 v14, v15, v14
; SI-NEXT: v_and_b32_e32 v12, 3, v12
@@ -2502,7 +2502,7 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
; SI-NEXT: v_and_b32_e32 v0, 3, v0
; SI-NEXT: v_or_b32_e32 v1, v17, v1
; SI-NEXT: v_or_b32_e32 v3, v18, v5
-; SI-NEXT: v_or_b32_e32 v5, v20, v9
+; SI-NEXT: v_or_b32_e32 v5, v19, v9
; SI-NEXT: v_or_b32_e32 v7, v16, v13
; SI-NEXT: v_or_b32_e32 v9, v12, v14
; SI-NEXT: v_or_b32_e32 v8, v8, v10
diff --git a/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir b/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir
index 00eb2b7e1aa8dd..4945c7020ca18c 100644
--- a/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir
+++ b/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir
@@ -49,39 +49,39 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub0:vreg_64, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF4]].sub0, [[DEF6]].sub0, 0, implicit $exec
- ; CHECK-NEXT: dead undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub1:vreg_64, dead [[V_ADDC_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[DEF4]].sub1, [[DEF6]].sub1, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
- ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF1]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
- ; CHECK-NEXT: dead [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF3]]
- ; CHECK-NEXT: dead [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF5]].sub1
+ ; CHECK-NEXT: undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub0:vreg_64, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF3]].sub0, [[DEF5]].sub0, 0, implicit $exec
+ ; CHECK-NEXT: dead undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub1:vreg_64, dead [[V_ADDC_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[DEF3]].sub1, [[DEF5]].sub1, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+ ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: dead [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF8]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
+ ; CHECK-NEXT: dead [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF1]]
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF4]].sub1
; CHECK-NEXT: dead [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B32_e32_]].sub0
- ; CHECK-NEXT: dead [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 4, [[DEF7]], implicit $exec
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF8]], 288, 0, implicit $exec :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: dead [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 4, [[DEF6]], implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF7]], 288, 0, implicit $exec :: (store (s64), addrspace 1)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef [[DEF5:%[0-9]+]].sub1:vreg_64 = COPY [[COPY5]]
+ ; CHECK-NEXT: undef [[DEF4:%[0-9]+]].sub1:vreg_64 = COPY [[COPY5]]
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4:
diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir
index cdd4c72f3717f0..8a1c68b3f66150 100644
--- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir
+++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir
@@ -24,7 +24,7 @@ body: |
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
@@ -32,10 +32,9 @@ body: |
; CHECK-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 1082130432, [[DEF]], implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 1082130432, [[DEF1]], implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
@@ -51,33 +50,34 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B32_e32_]]
- ; CHECK-NEXT: [[V_MUL_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MUL_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MUL_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF6]], [[DEF6]], implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MUL_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF6]], [[DEF6]], implicit $mode, implicit $exec
; CHECK-NEXT: [[V_MUL_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec
; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1092616192, implicit $exec
- ; CHECK-NEXT: [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MUL_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec
- ; CHECK-NEXT: dead [[V_MUL_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MUL_F32_e32_4]], [[DEF13]], implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MUL_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF6]], [[DEF6]], implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead [[V_MUL_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MUL_F32_e32_4]], [[DEF12]], implicit $mode, implicit $exec
; CHECK-NEXT: dead [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[V_ADD_F32_e32_]], [[COPY]], [[V_MOV_B32_e32_1]], implicit $mode, implicit $exec
- ; CHECK-NEXT: [[DEF14:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF13:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: $sgpr4 = IMPLICIT_DEF
- ; CHECK-NEXT: $vgpr0 = COPY [[DEF11]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[DEF10]]
; CHECK-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_]]
- ; CHECK-NEXT: $vgpr1 = COPY [[DEF7]]
+ ; CHECK-NEXT: $vgpr1 = COPY [[DEF6]]
; CHECK-NEXT: $vgpr0 = COPY [[V_MUL_F32_e32_1]]
; CHECK-NEXT: $vgpr1 = COPY [[V_MUL_F32_e32_2]]
; CHECK-NEXT: $vgpr2 = COPY [[V_MUL_F32_e32_3]]
- ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL [[DEF14]], @foo, csr_amdgpu, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $sgpr4, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit-def $vgpr0
- ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MUL_F32_e32_]], [[DEF8]], implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[DEF12]], [[DEF9]], [[V_ADD_F32_e32_1]], implicit $mode, implicit $exec
- ; CHECK-NEXT: dead [[V_MAD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: dead [[V_MAD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: dead [[V_MAD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF6]], 0, [[DEF3]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: GLOBAL_STORE_DWORD [[DEF]], [[DEF10]], 0, 0, implicit $exec
+ ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL [[DEF13]], @foo, csr_amdgpu, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $sgpr4, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit-def $vgpr0
+ ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MUL_F32_e32_]], [[DEF7]], implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[DEF11]], [[DEF8]], [[V_ADD_F32_e32_1]], implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead [[V_MAD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF3]], 0, [[DEF]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead [[V_MAD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead [[V_MAD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[DEF14:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD [[DEF14]], [[DEF9]], 0, 0, implicit $exec
; CHECK-NEXT: S_ENDPGM 0
bb.0:
successors: %bb.1
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index d9182d7ace8bfe..59bc7f332bf1e4 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -152,38 +152,38 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: .LBB0_3: ; %udiv-do-while
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_lshlrev_b64 v[30:31], 1, v[4:5]
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 31, v5
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5]
-; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11]
-; GFX9-NEXT: v_or_b32_e32 v4, v14, v4
+; GFX9-NEXT: v_or_b32_e32 v4, v14, v30
; GFX9-NEXT: v_lshrrev_b32_e32 v14, 31, v9
; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9]
+; GFX9-NEXT: v_or_b32_e32 v5, v15, v31
+; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11]
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 31, v3
+; GFX9-NEXT: v_or_b32_e32 v8, v8, v15
; GFX9-NEXT: v_or_b32_e32 v10, v10, v14
-; GFX9-NEXT: v_lshrrev_b32_e32 v14, 31, v3
-; GFX9-NEXT: v_or_b32_e32 v8, v8, v14
; GFX9-NEXT: v_sub_co_u32_e32 v14, vcc, v26, v8
; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v27, v9, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v28, v10, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v29, v11, vcc
; GFX9-NEXT: v_ashrrev_i32_e32 v30, 31, v14
; GFX9-NEXT: v_and_b32_e32 v14, v30, v21
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v8, v14
; GFX9-NEXT: v_and_b32_e32 v14, v30, v20
; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v9, v14, vcc
-; GFX9-NEXT: v_and_b32_e32 v14, v30, v0
-; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v14, vcc
+; GFX9-NEXT: v_or3_b32 v2, v2, v6, v12
+; GFX9-NEXT: v_and_b32_e32 v6, v30, v0
; GFX9-NEXT: v_and_b32_e32 v14, v30, v1
+; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v6, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v14, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, -1, v22
; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v24, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc
-; GFX9-NEXT: v_or_b32_e32 v5, v15, v5
-; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v14, v22, v24
; GFX9-NEXT: v_or_b32_e32 v15, v23, v25
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; GFX9-NEXT: v_or3_b32 v2, v2, v6, v12
; GFX9-NEXT: v_and_b32_e32 v6, 1, v30
; GFX9-NEXT: v_mov_b32_e32 v15, v7
; GFX9-NEXT: v_or3_b32 v3, v3, 0, v13
@@ -1227,13 +1227,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: v_ashrrev_i32_e32 v16, 31, v3
; GFX9-G-NEXT: v_xor_b32_e32 v0, v16, v0
; GFX9-G-NEXT: v_xor_b32_e32 v1, v16, v1
-; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v0, v16
+; GFX9-G-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v16
; GFX9-G-NEXT: v_xor_b32_e32 v2, v16, v2
-; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v1, v16, vcc
+; GFX9-G-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v16, vcc
; GFX9-G-NEXT: v_ashrrev_i32_e32 v17, 31, v7
; GFX9-G-NEXT: v_xor_b32_e32 v3, v16, v3
-; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v16, vcc
-; GFX9-G-NEXT: v_subb_co_u32_e32 v13, vcc, v3, v16, vcc
+; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v2, v16, vcc
+; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v3, v16, vcc
; GFX9-G-NEXT: v_xor_b32_e32 v0, v17, v4
; GFX9-G-NEXT: v_xor_b32_e32 v1, v17, v5
; GFX9-G-NEXT: v_sub_co_u32_e32 v18, vcc, v0, v17
@@ -1245,8 +1245,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: v_or_b32_e32 v0, v18, v4
; GFX9-G-NEXT: v_or_b32_e32 v1, v19, v5
; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GFX9-G-NEXT: v_or_b32_e32 v0, v10, v12
-; GFX9-G-NEXT: v_or_b32_e32 v1, v11, v13
+; GFX9-G-NEXT: v_or_b32_e32 v0, v8, v10
+; GFX9-G-NEXT: v_or_b32_e32 v1, v9, v11
; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v18
; GFX9-G-NEXT: v_ffbh_u32_e32 v0, v19
@@ -1258,15 +1258,15 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[4:5]
; GFX9-G-NEXT: v_add_u32_e32 v0, 64, v0
; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2
-; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v10
+; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v8
; GFX9-G-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[6:7]
-; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v11
+; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v9
; GFX9-G-NEXT: v_add_u32_e32 v2, 32, v2
-; GFX9-G-NEXT: v_ffbh_u32_e32 v3, v12
+; GFX9-G-NEXT: v_ffbh_u32_e32 v3, v10
; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2
-; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v13
+; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v11
; GFX9-G-NEXT: v_add_u32_e32 v3, 32, v3
-; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[12:13]
+; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
; GFX9-G-NEXT: v_add_u32_e32 v1, 64, v1
; GFX9-G-NEXT: v_min_u32_e32 v2, v2, v3
; GFX9-G-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[6:7]
@@ -1291,10 +1291,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: v_or_b32_e32 v14, v6, v2
; GFX9-G-NEXT: v_and_b32_e32 v6, 1, v20
; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v10, 0, vcc
-; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v11, 0, vcc
-; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v12, 0, vcc
-; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v13, 0, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v9, 0, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v10, 0, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v11, 0, vcc
; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
; GFX9-G-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GFX9-G-NEXT: v_or_b32_e32 v14, v20, v14
@@ -1309,23 +1309,23 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v2, vcc
; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v3, vcc
; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1
-; GFX9-G-NEXT: v_sub_co_u32_e32 v8, vcc, 0x7f, v0
-; GFX9-G-NEXT: v_sub_u32_e32 v0, 64, v8
-; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11]
-; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v8, v[12:13]
-; GFX9-G-NEXT: v_add_u32_e32 v9, 0xffffffc0, v8
-; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], v8, v[10:11]
+; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, 0x7f, v0
+; GFX9-G-NEXT: v_sub_u32_e32 v0, 64, v12
+; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v0, v[8:9]
+; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v12, v[10:11]
+; GFX9-G-NEXT: v_add_u32_e32 v13, 0xffffffc0, v12
+; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], v12, v[8:9]
; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2
; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], v9, v[10:11]
-; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8
+; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], v13, v[8:9]
+; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12
; GFX9-G-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX9-G-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
; GFX9-G-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-G-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
-; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v0, v12, vcc
-; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v1, v13, vcc
+; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12
+; GFX9-G-NEXT: v_cndmask_b32_e32 v12, v0, v10, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v13, v1, v11, vcc
; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX9-G-NEXT: v_mov_b32_e32 v0, s8
; GFX9-G-NEXT: v_mov_b32_e32 v1, s9
@@ -1336,13 +1336,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: s_cbranch_execz .LBB0_5
; GFX9-G-NEXT: ; %bb.2: ; %udiv-preheader
; GFX9-G-NEXT: v_sub_u32_e32 v2, 64, v20
-; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v20, v[10:11]
-; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v2, v[12:13]
+; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v20, v[8:9]
+; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11]
; GFX9-G-NEXT: v_add_u32_e32 v24, 0xffffffc0, v20
-; GFX9-G-NEXT: v_lshrrev_b64 v[14:15], v20, v[12:13]
+; GFX9-G-NEXT: v_lshrrev_b64 v[14:15], v20, v[10:11]
; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2
; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v24, v[12:13]
+; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11]
; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v20
; GFX9-G-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-G-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -1352,54 +1352,54 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: s_mov_b64 s[8:9], 0
; GFX9-G-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v20
; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v19, vcc
-; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v0, v10, s[4:5]
-; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v1, v11, s[4:5]
+; GFX9-G-NEXT: v_cndmask_b32_e64 v10, v0, v8, s[4:5]
+; GFX9-G-NEXT: v_cndmask_b32_e64 v11, v1, v9, s[4:5]
; GFX9-G-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v4, vcc
; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX9-G-NEXT: v_mov_b32_e32 v0, s8
; GFX9-G-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v5, vcc
-; GFX9-G-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-G-NEXT: v_mov_b32_e32 v9, 0
; GFX9-G-NEXT: v_mov_b32_e32 v1, s9
; GFX9-G-NEXT: v_mov_b32_e32 v2, s10
; GFX9-G-NEXT: v_mov_b32_e32 v3, s11
; GFX9-G-NEXT: .LBB0_3: ; %udiv-do-while
; GFX9-G-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7]
-; GFX9-G-NEXT: v_lshrrev_b32_e32 v10, 31, v7
+; GFX9-G-NEXT: v_lshrrev_b32_e32 v8, 31, v7
; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2
; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3
-; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[12:13]
-; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v9
+; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[10:11]
+; GFX9-G-NEXT: v_lshrrev_b32_e32 v10, 31, v13
; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 1, v[14:15]
-; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v12
-; GFX9-G-NEXT: v_lshrrev_b32_e32 v14, 31, v13
-; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v24, v2
+; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v10
+; GFX9-G-NEXT: v_lshrrev_b32_e32 v14, 31, v11
+; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v24, v2
; GFX9-G-NEXT: v_or_b32_e32 v0, v0, v14
-; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v25, v3, vcc
-; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v26, v0, vcc
-; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v27, v1, vcc
-; GFX9-G-NEXT: v_ashrrev_i32_e32 v28, 31, v12
-; GFX9-G-NEXT: v_and_b32_e32 v12, v28, v18
-; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v2, v12
-; GFX9-G-NEXT: v_and_b32_e32 v2, v28, v19
-; GFX9-G-NEXT: v_subb_co_u32_e32 v13, vcc, v3, v2, vcc
+; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v25, v3, vcc
+; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v26, v0, vcc
+; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v27, v1, vcc
+; GFX9-G-NEXT: v_ashrrev_i32_e32 v28, 31, v10
+; GFX9-G-NEXT: v_and_b32_e32 v10, v28, v18
+; GFX9-G-NEXT: v_and_b32_e32 v11, v28, v19
+; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v2, v10
+; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v3, v11, vcc
; GFX9-G-NEXT: v_and_b32_e32 v2, v28, v4
+; GFX9-G-NEXT: v_and_b32_e32 v3, v28, v5
; GFX9-G-NEXT: v_subb_co_u32_e32 v14, vcc, v0, v2, vcc
-; GFX9-G-NEXT: v_and_b32_e32 v0, v28, v5
-; GFX9-G-NEXT: v_subb_co_u32_e32 v15, vcc, v1, v0, vcc
+; GFX9-G-NEXT: v_subb_co_u32_e32 v15, vcc, v1, v3, vcc
; GFX9-G-NEXT: v_add_co_u32_e32 v20, vcc, -1, v20
; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc
; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v22, vcc
; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc
-; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9]
+; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13]
; GFX9-G-NEXT: v_or_b32_e32 v0, v20, v22
; GFX9-G-NEXT: v_or_b32_e32 v1, v21, v23
; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v10
-; GFX9-G-NEXT: v_and_b32_e32 v10, 1, v28
-; GFX9-G-NEXT: v_mov_b32_e32 v0, v10
+; GFX9-G-NEXT: v_or_b32_e32 v12, v12, v8
+; GFX9-G-NEXT: v_and_b32_e32 v8, 1, v28
+; GFX9-G-NEXT: v_mov_b32_e32 v0, v8
; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX9-G-NEXT: v_mov_b32_e32 v1, v11
+; GFX9-G-NEXT: v_mov_b32_e32 v1, v9
; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX9-G-NEXT: s_cbranch_execnz .LBB0_3
; GFX9-G-NEXT: ; %bb.4: ; %Flow
@@ -1407,9 +1407,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: .LBB0_5: ; %Flow2
; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13]
; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7]
-; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9]
+; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13]
; GFX9-G-NEXT: v_lshrrev_b32_e32 v4, 31, v7
-; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v4
+; GFX9-G-NEXT: v_or_b32_e32 v12, v12, v4
; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2
; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3
; GFX9-G-NEXT: .LBB0_6: ; %Flow3
@@ -1418,9 +1418,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: v_xor_b32_e32 v0, v6, v3
; GFX9-G-NEXT: v_xor_b32_e32 v1, v7, v3
; GFX9-G-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3
-; GFX9-G-NEXT: v_xor_b32_e32 v2, v8, v3
+; GFX9-G-NEXT: v_xor_b32_e32 v2, v12, v3
; GFX9-G-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-G-NEXT: v_xor_b32_e32 v4, v9, v3
+; GFX9-G-NEXT: v_xor_b32_e32 v4, v13, v3
; GFX9-G-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc
; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
; GFX9-G-NEXT: s_setpc_b64 s[30:31]
@@ -2439,16 +2439,15 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v13, 0
; GFX9-NEXT: .LBB1_3: ; %udiv-do-while
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_lshlrev_b64 v[26:27], 1, v[10:11]
; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v11
-; GFX9-NEXT: v_or_b32_e32 v10, v16, v26
+; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11]
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX9-NEXT: v_or_b32_e32 v10, v16, v10
; GFX9-NEXT: v_lshrrev_b32_e32 v16, 31, v1
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX9-NEXT: v_or_b32_e32 v11, v17, v27
-; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 31, v9
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v17
; GFX9-NEXT: v_or_b32_e32 v2, v2, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 31, v9
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v16
; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v22, v0
; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v23, v1, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v24, v2, vcc
@@ -2457,20 +2456,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_and_b32_e32 v16, v26, v4
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v16
; GFX9-NEXT: v_and_b32_e32 v16, v26, v5
-; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9]
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v16, vcc
; GFX9-NEXT: v_and_b32_e32 v16, v26, v6
-; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14
-; GFX9-NEXT: v_and_b32_e32 v12, v26, v7
; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v16, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v12, vcc
+; GFX9-NEXT: v_and_b32_e32 v16, v26, v7
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v16, vcc
; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, -1, v18
; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, -1, v19, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, -1, v20, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc
+; GFX9-NEXT: v_or_b32_e32 v11, v17, v11
+; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9]
; GFX9-NEXT: v_or_b32_e32 v16, v18, v20
; GFX9-NEXT: v_or_b32_e32 v17, v19, v21
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14
; GFX9-NEXT: v_and_b32_e32 v12, 1, v26
; GFX9-NEXT: v_mov_b32_e32 v17, v13
; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15
@@ -3506,37 +3506,37 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: v_lshrrev_b32_e32 v0, 31, v15
; GFX9-G-NEXT: v_or_b32_e32 v14, v10, v12
; GFX9-G-NEXT: v_or_b32_e32 v15, v11, v13
-; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[16:17]
-; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], 1, v[2:3]
-; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 31, v3
-; GFX9-G-NEXT: v_or_b32_e32 v12, v12, v2
-; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 31, v9
-; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9]
-; GFX9-G-NEXT: v_or_b32_e32 v2, v10, v2
-; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v0
-; GFX9-G-NEXT: v_sub_co_u32_e32 v0, vcc, v22, v2
-; GFX9-G-NEXT: v_subb_co_u32_e32 v0, vcc, v23, v11, vcc
-; GFX9-G-NEXT: v_subb_co_u32_e32 v0, vcc, v24, v12, vcc
-; GFX9-G-NEXT: v_subb_co_u32_e32 v0, vcc, v25, v13, vcc
-; GFX9-G-NEXT: v_add_co_u32_e64 v18, s[4:5], -1, v18
-; GFX9-G-NEXT: v_ashrrev_i32_e32 v3, 31, v0
-; GFX9-G-NEXT: v_addc_co_u32_e64 v19, s[4:5], -1, v19, s[4:5]
-; GFX9-G-NEXT: v_and_b32_e32 v10, v3, v4
-; GFX9-G-NEXT: v_addc_co_u32_e64 v20, s[4:5], -1, v20, s[4:5]
-; GFX9-G-NEXT: v_and_b32_e32 v16, v3, v5
-; GFX9-G-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v10
-; GFX9-G-NEXT: v_addc_co_u32_e64 v21, s[4:5], -1, v21, s[4:5]
-; GFX9-G-NEXT: v_and_b32_e32 v0, 1, v3
-; GFX9-G-NEXT: v_and_b32_e32 v17, v3, v6
-; GFX9-G-NEXT: v_and_b32_e32 v26, v3, v7
-; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v11, v16, vcc
+; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], 1, v[16:17]
+; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v3
+; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v12
+; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v9
+; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v12
+; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v22, v2
+; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v23, v3, vcc
+; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v24, v10, vcc
+; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v25, v11, vcc
+; GFX9-G-NEXT: v_ashrrev_i32_e32 v12, 31, v12
+; GFX9-G-NEXT: v_and_b32_e32 v13, v12, v4
+; GFX9-G-NEXT: v_and_b32_e32 v16, v12, v5
+; GFX9-G-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v13
+; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v16, vcc
+; GFX9-G-NEXT: v_and_b32_e32 v13, v12, v6
+; GFX9-G-NEXT: v_and_b32_e32 v17, v12, v7
+; GFX9-G-NEXT: v_subb_co_u32_e32 v16, vcc, v10, v13, vcc
+; GFX9-G-NEXT: v_subb_co_u32_e32 v17, vcc, v11, v17, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v18, vcc, -1, v18
+; GFX9-G-NEXT: v_addc_co_u32_e32 v19, vcc, -1, v19, vcc
+; GFX9-G-NEXT: v_addc_co_u32_e32 v20, vcc, -1, v20, vcc
+; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc
; GFX9-G-NEXT: v_or_b32_e32 v10, v18, v20
; GFX9-G-NEXT: v_or_b32_e32 v11, v19, v21
-; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; GFX9-G-NEXT: v_subb_co_u32_e32 v16, vcc, v12, v17, vcc
+; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9]
+; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v0
+; GFX9-G-NEXT: v_and_b32_e32 v0, 1, v12
; GFX9-G-NEXT: v_mov_b32_e32 v11, v1
-; GFX9-G-NEXT: v_subb_co_u32_e32 v17, vcc, v13, v26, vcc
-; GFX9-G-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9]
+; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX9-G-NEXT: v_mov_b32_e32 v10, v0
; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX9-G-NEXT: s_cbranch_execnz .LBB1_3
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 691f3d36bc7360..8d65fa053eaa49 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -6,430 +6,430 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-LABEL: v_sdiv_v2i128_vv:
; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_ashrrev_i32_e32 v26, 31, v3
-; SDAG-NEXT: v_ashrrev_i32_e32 v27, 31, v11
; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
-; SDAG-NEXT: v_mov_b32_e32 v19, 0
+; SDAG-NEXT: v_mov_b32_e32 v18, 0
+; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v3
+; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v11
; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
-; SDAG-NEXT: v_mov_b32_e32 v28, v26
-; SDAG-NEXT: v_mov_b32_e32 v29, v27
; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc
-; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v2, vcc
+; SDAG-NEXT: v_mov_b32_e32 v26, v24
+; SDAG-NEXT: v_mov_b32_e32 v27, v25
+; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v2, vcc
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v17, v1, v17, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v16, v0, v16, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, v1, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, v0, v16, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v3, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5]
-; SDAG-NEXT: v_ffbh_u32_e32 v1, v16
-; SDAG-NEXT: v_ffbh_u32_e32 v18, v17
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[4:5]
-; SDAG-NEXT: v_sub_i32_e32 v20, vcc, 0, v8
-; SDAG-NEXT: v_or_b32_e32 v0, v16, v2
-; SDAG-NEXT: v_ffbh_u32_e32 v21, v2
-; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], 32, v1
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v2, v19, s[4:5]
+; SDAG-NEXT: v_ffbh_u32_e32 v1, v20
+; SDAG-NEXT: v_ffbh_u32_e32 v2, v21
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v3, v0, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v0, v20, v16
+; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0, v8
+; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v1
+; SDAG-NEXT: v_ffbh_u32_e32 v22, v16
+; SDAG-NEXT: v_or_b32_e32 v1, v21, v17
; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc
-; SDAG-NEXT: v_or_b32_e32 v1, v17, v3
-; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], 32, v21
-; SDAG-NEXT: v_min_u32_e32 v18, v22, v18
-; SDAG-NEXT: v_ffbh_u32_e32 v22, v3
-; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v23, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v10, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v20, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[0:1]
-; SDAG-NEXT: v_min_u32_e32 v1, v21, v22
-; SDAG-NEXT: v_add_i32_e64 v8, s[8:9], 64, v18
-; SDAG-NEXT: v_addc_u32_e64 v18, s[8:9], 0, 0, s[8:9]
-; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v11, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v0, v10, v9, s[4:5]
-; SDAG-NEXT: v_ffbh_u32_e32 v9, v31
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, v18, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v18, v8, v1, vcc
-; SDAG-NEXT: v_ffbh_u32_e32 v21, v30
-; SDAG-NEXT: v_cndmask_b32_e64 v1, v11, v20, s[4:5]
-; SDAG-NEXT: v_or_b32_e32 v8, v31, v0
+; SDAG-NEXT: v_min_u32_e32 v2, v19, v2
+; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v22
+; SDAG-NEXT: v_ffbh_u32_e32 v22, v17
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
+; SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
+; SDAG-NEXT: v_cndmask_b32_e64 v28, v9, v23, s[6:7]
+; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v10, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v29, v8, v3, s[6:7]
+; SDAG-NEXT: v_min_u32_e32 v1, v19, v22
+; SDAG-NEXT: v_add_i32_e64 v2, s[8:9], 64, v2
+; SDAG-NEXT: v_addc_u32_e64 v3, s[8:9], 0, 0, s[8:9]
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v11, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[6:7]
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v10, v2, v1, vcc
+; SDAG-NEXT: v_ffbh_u32_e32 v3, v29
+; SDAG-NEXT: v_ffbh_u32_e32 v19, v28
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v11, v8, s[6:7]
+; SDAG-NEXT: v_or_b32_e32 v2, v29, v0
+; SDAG-NEXT: v_add_i32_e32 v8, vcc, 32, v3
; SDAG-NEXT: v_ffbh_u32_e32 v11, v0
-; SDAG-NEXT: v_add_i32_e32 v20, vcc, 32, v9
-; SDAG-NEXT: v_or_b32_e32 v9, v30, v1
+; SDAG-NEXT: v_or_b32_e32 v3, v28, v1
+; SDAG-NEXT: v_min_u32_e32 v8, v8, v19
; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11
-; SDAG-NEXT: v_min_u32_e32 v20, v20, v21
-; SDAG-NEXT: v_ffbh_u32_e32 v21, v1
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; SDAG-NEXT: v_min_u32_e32 v8, v11, v21
-; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 64, v20
-; SDAG-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
-; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v8, v18
-; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v11, v10, vcc
-; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v8
-; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v19, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9]
-; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
-; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v19, vcc
-; SDAG-NEXT: v_or_b32_e32 v10, v10, v18
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
-; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
-; SDAG-NEXT: v_or_b32_e32 v11, v9, v19
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
-; SDAG-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc
+; SDAG-NEXT: v_ffbh_u32_e32 v19, v1
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT: v_min_u32_e32 v2, v11, v19
+; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 64, v8
+; SDAG-NEXT: v_addc_u32_e64 v8, s[6:7], 0, 0, s[6:7]
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[6:7]
+; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
+; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v8, v9, vcc
+; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v2
+; SDAG-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v18, vcc
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; SDAG-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v18, vcc
+; SDAG-NEXT: v_or_b32_e32 v8, v8, v10
+; SDAG-NEXT: v_or_b32_e32 v9, v3, v11
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_and_b32_e32 v10, 1, v20
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10
+; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v18, v19, s[4:5]
+; SDAG-NEXT: v_and_b32_e32 v8, 1, v8
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v20, v3, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v17, 0, s[4:5]
; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; SDAG-NEXT: v_cndmask_b32_e64 v21, v2, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v22, v17, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v23, v16, 0, s[4:5]
-; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
-; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v22, v16, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v21, 0, s[4:5]
+; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v23, v20, 0, s[4:5]
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9]
; SDAG-NEXT: s_cbranch_execz .LBB0_6
; SDAG-NEXT: ; %bb.1: ; %udiv-bb15
-; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v8
-; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v8
+; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v2
+; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v2
+; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v9, 0
+; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v3, vcc
+; SDAG-NEXT: v_lshl_b64 v[18:19], v[20:21], v18
+; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc
+; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc
+; SDAG-NEXT: v_or_b32_e32 v10, v30, v32
+; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v2
+; SDAG-NEXT: v_or_b32_e32 v11, v31, v33
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[16:17], v34
+; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[20:21], v34
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT: v_lshr_b64 v[10:11], v[20:21], v35
+; SDAG-NEXT: v_or_b32_e32 v3, v3, v11
+; SDAG-NEXT: v_or_b32_e32 v2, v2, v10
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, v23, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, v22, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v16, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v10, 0
; SDAG-NEXT: v_mov_b32_e32 v11, 0
-; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v9, vcc
-; SDAG-NEXT: v_lshl_b64 v[20:21], v[16:17], v20
-; SDAG-NEXT: v_addc_u32_e32 v34, vcc, 0, v18, vcc
-; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v19, vcc
-; SDAG-NEXT: v_or_b32_e32 v18, v32, v34
-; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v8
-; SDAG-NEXT: v_or_b32_e32 v19, v33, v35
-; SDAG-NEXT: v_lshl_b64 v[8:9], v[2:3], v24
-; SDAG-NEXT: v_sub_i32_e32 v25, vcc, 64, v24
-; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v24
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
-; SDAG-NEXT: v_lshr_b64 v[18:19], v[16:17], v25
-; SDAG-NEXT: v_or_b32_e32 v9, v9, v19
-; SDAG-NEXT: v_or_b32_e32 v8, v8, v18
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v21, v9, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v8, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v3, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[4:5]
-; SDAG-NEXT: v_mov_b32_e32 v18, 0
-; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB0_5
; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4
-; SDAG-NEXT: v_lshr_b64 v[10:11], v[16:17], v32
-; SDAG-NEXT: v_sub_i32_e32 v37, vcc, 64, v32
-; SDAG-NEXT: v_subrev_i32_e32 v48, vcc, 64, v32
-; SDAG-NEXT: v_lshr_b64 v[24:25], v[2:3], v32
-; SDAG-NEXT: v_add_i32_e32 v36, vcc, -1, v31
-; SDAG-NEXT: s_mov_b64 s[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v22, 0
-; SDAG-NEXT: v_mov_b32_e32 v23, 0
-; SDAG-NEXT: v_mov_b32_e32 v18, 0
-; SDAG-NEXT: v_mov_b32_e32 v19, 0
-; SDAG-NEXT: v_lshl_b64 v[38:39], v[2:3], v37
-; SDAG-NEXT: v_lshr_b64 v[2:3], v[2:3], v48
-; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v30, vcc
-; SDAG-NEXT: v_or_b32_e32 v11, v11, v39
-; SDAG-NEXT: v_or_b32_e32 v10, v10, v38
-; SDAG-NEXT: v_addc_u32_e32 v38, vcc, -1, v0, vcc
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v32
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v25, 0, v25, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v24, s[4:5]
-; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v1, vcc
-; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32
-; SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc
+; SDAG-NEXT: v_lshr_b64 v[8:9], v[20:21], v30
+; SDAG-NEXT: v_sub_i32_e32 v10, vcc, 64, v30
+; SDAG-NEXT: v_lshl_b64 v[10:11], v[16:17], v10
+; SDAG-NEXT: v_or_b32_e32 v11, v9, v11
+; SDAG-NEXT: v_or_b32_e32 v10, v8, v10
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30
+; SDAG-NEXT: v_subrev_i32_e64 v8, s[4:5], 64, v30
+; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v8
+; SDAG-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30
+; SDAG-NEXT: v_cndmask_b32_e64 v21, v9, v21, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v20, v8, v20, s[4:5]
+; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v30
+; SDAG-NEXT: v_cndmask_b32_e32 v23, 0, v9, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v22, 0, v8, vcc
+; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v29
+; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v28, vcc
+; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc
+; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc
+; SDAG-NEXT: s_mov_b64 s[4:5], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v17, 0
+; SDAG-NEXT: v_mov_b32_e32 v10, 0
; SDAG-NEXT: v_mov_b32_e32 v11, 0
+; SDAG-NEXT: v_mov_b32_e32 v9, 0
; SDAG-NEXT: .LBB0_3: ; %udiv-do-while3
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; SDAG-NEXT: v_lshl_b64 v[16:17], v[24:25], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v3
-; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v9
-; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v25, 31, v21
+; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v19
+; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v21
; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
-; SDAG-NEXT: v_or_b32_e32 v16, v16, v10
-; SDAG-NEXT: v_or_b32_e32 v2, v2, v24
-; SDAG-NEXT: v_or_b32_e32 v8, v8, v25
-; SDAG-NEXT: v_or_b32_e32 v9, v19, v9
-; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v36, v2
-; SDAG-NEXT: v_or_b32_e32 v8, v18, v8
-; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v37, v3, vcc
-; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v38, v16, vcc
-; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v39, v17, vcc
-; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v10
-; SDAG-NEXT: v_and_b32_e32 v25, v24, v31
-; SDAG-NEXT: v_and_b32_e32 v48, v24, v30
-; SDAG-NEXT: v_and_b32_e32 v49, v24, v0
-; SDAG-NEXT: v_and_b32_e32 v10, 1, v24
-; SDAG-NEXT: v_and_b32_e32 v50, v24, v1
-; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v25
-; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v48, vcc
-; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v16, v49, vcc
-; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v17, v50, vcc
-; SDAG-NEXT: v_add_i32_e32 v32, vcc, -1, v32
+; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v3
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; SDAG-NEXT: v_or_b32_e32 v19, v17, v19
+; SDAG-NEXT: v_or_b32_e32 v18, v16, v18
+; SDAG-NEXT: v_or_b32_e32 v16, v22, v38
+; SDAG-NEXT: v_or_b32_e32 v17, v20, v39
+; SDAG-NEXT: v_or_b32_e32 v2, v2, v8
+; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v34, v17
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v35, v21, vcc
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v36, v16, vcc
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v23, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8
+; SDAG-NEXT: v_and_b32_e32 v20, v8, v29
+; SDAG-NEXT: v_and_b32_e32 v22, v8, v28
+; SDAG-NEXT: v_and_b32_e32 v38, v8, v0
+; SDAG-NEXT: v_and_b32_e32 v39, v8, v1
+; SDAG-NEXT: v_and_b32_e32 v8, 1, v8
+; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v17, v20
+; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v21, v22, vcc
+; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v16, v38, vcc
+; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v23, v39, vcc
+; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30
+; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc
+; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc
-; SDAG-NEXT: v_addc_u32_e32 v34, vcc, -1, v34, vcc
-; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc
-; SDAG-NEXT: v_or_b32_e32 v16, v32, v34
-; SDAG-NEXT: v_or_b32_e32 v17, v33, v35
+; SDAG-NEXT: v_or_b32_e32 v16, v30, v32
+; SDAG-NEXT: v_or_b32_e32 v17, v31, v33
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT: v_or_b32_e32 v21, v23, v21
-; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; SDAG-NEXT: v_or_b32_e32 v20, v22, v20
-; SDAG-NEXT: v_mov_b32_e32 v23, v11
-; SDAG-NEXT: v_mov_b32_e32 v22, v10
-; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; SDAG-NEXT: v_or_b32_e32 v3, v11, v3
+; SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v2, v10, v2
+; SDAG-NEXT: v_mov_b32_e32 v17, v9
+; SDAG-NEXT: v_mov_b32_e32 v16, v8
+; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5]
; SDAG-NEXT: s_cbranch_execnz .LBB0_3
; SDAG-NEXT: ; %bb.4: ; %Flow13
-; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
; SDAG-NEXT: .LBB0_5: ; %Flow14
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v21
-; SDAG-NEXT: v_lshl_b64 v[2:3], v[20:21], 1
-; SDAG-NEXT: v_or_b32_e32 v0, v0, v8
-; SDAG-NEXT: v_or_b32_e32 v20, v19, v1
-; SDAG-NEXT: v_or_b32_e32 v22, v11, v3
-; SDAG-NEXT: v_or_b32_e32 v21, v18, v0
-; SDAG-NEXT: v_or_b32_e32 v23, v10, v2
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v19
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[18:19], 1
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v16
+; SDAG-NEXT: v_or_b32_e32 v18, v11, v1
+; SDAG-NEXT: v_or_b32_e32 v19, v9, v3
+; SDAG-NEXT: v_or_b32_e32 v22, v10, v0
+; SDAG-NEXT: v_or_b32_e32 v23, v8, v2
; SDAG-NEXT: .LBB0_6: ; %Flow16
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v7
; SDAG-NEXT: v_ashrrev_i32_e32 v17, 31, v15
; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 0, v4
-; SDAG-NEXT: v_mov_b32_e32 v9, 0
+; SDAG-NEXT: v_mov_b32_e32 v8, 0
; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
-; SDAG-NEXT: v_mov_b32_e32 v18, v16
-; SDAG-NEXT: v_mov_b32_e32 v19, v17
+; SDAG-NEXT: v_mov_b32_e32 v20, v16
+; SDAG-NEXT: v_mov_b32_e32 v21, v17
; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v5, vcc
-; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v6, vcc
+; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v6, vcc
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v1, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v0, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v7, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v4, v6, v8, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5]
; SDAG-NEXT: v_ffbh_u32_e32 v1, v2
-; SDAG-NEXT: v_ffbh_u32_e32 v6, v3
-; SDAG-NEXT: v_cndmask_b32_e64 v5, v7, v0, s[4:5]
-; SDAG-NEXT: v_sub_i32_e32 v7, vcc, 0, v12
-; SDAG-NEXT: v_or_b32_e32 v0, v2, v4
-; SDAG-NEXT: v_ffbh_u32_e32 v8, v4
+; SDAG-NEXT: v_ffbh_u32_e32 v4, v3
+; SDAG-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[4:5]
+; SDAG-NEXT: v_sub_i32_e32 v5, vcc, 0, v12
+; SDAG-NEXT: v_or_b32_e32 v0, v2, v6
+; SDAG-NEXT: v_ffbh_u32_e32 v9, v6
; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], 32, v1
; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v13, vcc
-; SDAG-NEXT: v_or_b32_e32 v1, v3, v5
-; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], 32, v8
-; SDAG-NEXT: v_ffbh_u32_e32 v30, v5
-; SDAG-NEXT: v_min_u32_e32 v6, v10, v6
+; SDAG-NEXT: v_or_b32_e32 v1, v3, v7
+; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 32, v9
+; SDAG-NEXT: v_ffbh_u32_e32 v30, v7
+; SDAG-NEXT: v_min_u32_e32 v4, v10, v4
; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v14, vcc
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15]
-; SDAG-NEXT: v_cndmask_b32_e64 v24, v13, v11, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v25, v12, v7, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v28, v13, v11, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v29, v12, v5, s[4:5]
; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[0:1]
-; SDAG-NEXT: v_min_u32_e32 v1, v8, v30
-; SDAG-NEXT: v_add_i32_e64 v6, s[8:9], 64, v6
-; SDAG-NEXT: v_addc_u32_e64 v7, s[8:9], 0, 0, s[8:9]
-; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v15, vcc
+; SDAG-NEXT: v_min_u32_e32 v1, v9, v30
+; SDAG-NEXT: v_add_i32_e64 v4, s[8:9], 64, v4
+; SDAG-NEXT: v_addc_u32_e64 v5, s[8:9], 0, 0, s[8:9]
+; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v15, vcc
; SDAG-NEXT: v_cndmask_b32_e64 v0, v14, v10, s[4:5]
-; SDAG-NEXT: v_ffbh_u32_e32 v10, v25
-; SDAG-NEXT: v_ffbh_u32_e32 v11, v24
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v12, v7, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v13, v6, v1, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v1, v15, v8, s[4:5]
-; SDAG-NEXT: v_or_b32_e32 v6, v25, v0
-; SDAG-NEXT: v_ffbh_u32_e32 v8, v0
+; SDAG-NEXT: v_ffbh_u32_e32 v10, v29
+; SDAG-NEXT: v_ffbh_u32_e32 v11, v28
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v12, v5, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v13, v4, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v15, v9, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v4, v29, v0
+; SDAG-NEXT: v_ffbh_u32_e32 v9, v0
; SDAG-NEXT: v_add_i32_e32 v10, vcc, 32, v10
-; SDAG-NEXT: v_or_b32_e32 v7, v24, v1
-; SDAG-NEXT: v_add_i32_e32 v8, vcc, 32, v8
+; SDAG-NEXT: v_or_b32_e32 v5, v28, v1
+; SDAG-NEXT: v_add_i32_e32 v9, vcc, 32, v9
; SDAG-NEXT: v_ffbh_u32_e32 v14, v1
; SDAG-NEXT: v_min_u32_e32 v10, v10, v11
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; SDAG-NEXT: v_min_u32_e32 v6, v8, v14
-; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], 64, v10
-; SDAG-NEXT: v_addc_u32_e64 v8, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT: v_min_u32_e32 v4, v9, v14
+; SDAG-NEXT: v_add_i32_e64 v5, s[4:5], 64, v10
+; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5]
; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
-; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v6, v13
-; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v8, v12, vcc
-; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v6
-; SDAG-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v9, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v13
+; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v9, v12, vcc
+; SDAG-NEXT: v_xor_b32_e32 v9, 0x7f, v4
+; SDAG-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v8, vcc
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; SDAG-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
-; SDAG-NEXT: v_or_b32_e32 v10, v10, v8
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v8, vcc
+; SDAG-NEXT: v_or_b32_e32 v8, v9, v10
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; SDAG-NEXT: v_or_b32_e32 v11, v7, v9
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT: v_or_b32_e32 v9, v5, v11
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
; SDAG-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_and_b32_e32 v10, 1, v12
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT: v_and_b32_e32 v8, 1, v12
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v13, v5, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v13, v7, 0, s[4:5]
; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; SDAG-NEXT: v_cndmask_b32_e64 v11, v4, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v6, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v14, v3, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v2, 0, s[4:5]
; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB0_12
; SDAG-NEXT: ; %bb.7: ; %udiv-bb1
-; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v6
-; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v6
-; SDAG-NEXT: v_mov_b32_e32 v10, 0
-; SDAG-NEXT: v_mov_b32_e32 v11, 0
-; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v7, vcc
+; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v4
+; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v4
+; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v9, 0
+; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v5, vcc
; SDAG-NEXT: v_lshl_b64 v[12:13], v[2:3], v12
-; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v8, vcc
-; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v9, vcc
-; SDAG-NEXT: v_or_b32_e32 v7, v30, v32
-; SDAG-NEXT: v_sub_i32_e32 v9, vcc, 0x7f, v6
-; SDAG-NEXT: v_or_b32_e32 v8, v31, v33
-; SDAG-NEXT: v_lshl_b64 v[14:15], v[4:5], v9
-; SDAG-NEXT: v_sub_i32_e32 v6, vcc, 64, v9
-; SDAG-NEXT: v_lshl_b64 v[34:35], v[2:3], v9
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8]
-; SDAG-NEXT: v_lshr_b64 v[6:7], v[2:3], v6
-; SDAG-NEXT: v_or_b32_e32 v7, v15, v7
-; SDAG-NEXT: v_or_b32_e32 v6, v14, v6
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v9
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v13, v7, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v35, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v34, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v8, v5, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v12, v4, s[4:5]
+; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc
+; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc
+; SDAG-NEXT: v_or_b32_e32 v10, v30, v32
+; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v4
+; SDAG-NEXT: v_or_b32_e32 v11, v31, v33
+; SDAG-NEXT: v_lshl_b64 v[4:5], v[6:7], v34
+; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34
+; SDAG-NEXT: v_lshl_b64 v[14:15], v[2:3], v34
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v35
+; SDAG-NEXT: v_or_b32_e32 v5, v5, v11
+; SDAG-NEXT: v_or_b32_e32 v4, v4, v10
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34
+; SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v15, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v14, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34
+; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: v_mov_b32_e32 v13, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB0_11
; SDAG-NEXT: ; %bb.8: ; %udiv-preheader
-; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v30
+; SDAG-NEXT: v_lshr_b64 v[8:9], v[2:3], v30
; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v30
; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v30
-; SDAG-NEXT: v_lshr_b64 v[37:38], v[4:5], v30
-; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v25
+; SDAG-NEXT: v_lshr_b64 v[37:38], v[6:7], v30
+; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v29
; SDAG-NEXT: s_mov_b64 s[10:11], 0
; SDAG-NEXT: v_mov_b32_e32 v14, 0
; SDAG-NEXT: v_mov_b32_e32 v15, 0
; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: v_mov_b32_e32 v13, 0
-; SDAG-NEXT: v_lshl_b64 v[48:49], v[4:5], v35
-; SDAG-NEXT: v_lshr_b64 v[4:5], v[4:5], v36
-; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v24, vcc
-; SDAG-NEXT: v_or_b32_e32 v11, v11, v49
-; SDAG-NEXT: v_or_b32_e32 v10, v10, v48
+; SDAG-NEXT: v_lshl_b64 v[48:49], v[6:7], v35
+; SDAG-NEXT: v_lshr_b64 v[6:7], v[6:7], v36
+; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v28, vcc
+; SDAG-NEXT: v_or_b32_e32 v9, v9, v49
+; SDAG-NEXT: v_or_b32_e32 v8, v8, v48
; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30
-; SDAG-NEXT: v_cndmask_b32_e64 v11, v5, v11, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, v4, v10, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v38, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v37, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v6, v8, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v38, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v37, s[4:5]
; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc
; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30
-; SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
-; SDAG-NEXT: v_mov_b32_e32 v11, 0
+; SDAG-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
+; SDAG-NEXT: v_mov_b32_e32 v9, 0
; SDAG-NEXT: .LBB0_9: ; %udiv-do-while
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v3
-; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v9
-; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v7
; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
-; SDAG-NEXT: v_or_b32_e32 v4, v4, v10
+; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v3
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v5
+; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v11
+; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT: v_or_b32_e32 v6, v6, v8
; SDAG-NEXT: v_or_b32_e32 v2, v2, v38
-; SDAG-NEXT: v_or_b32_e32 v8, v8, v39
-; SDAG-NEXT: v_or_b32_e32 v9, v13, v9
-; SDAG-NEXT: v_or_b32_e32 v7, v15, v7
-; SDAG-NEXT: v_or_b32_e32 v8, v12, v8
-; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v34, v2
-; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v35, v3, vcc
-; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v36, v4, vcc
-; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v37, v5, vcc
-; SDAG-NEXT: v_ashrrev_i32_e32 v15, 31, v10
-; SDAG-NEXT: v_and_b32_e32 v10, 1, v15
-; SDAG-NEXT: v_and_b32_e32 v38, v15, v1
-; SDAG-NEXT: v_and_b32_e32 v39, v15, v0
-; SDAG-NEXT: v_and_b32_e32 v48, v15, v24
-; SDAG-NEXT: v_and_b32_e32 v15, v15, v25
+; SDAG-NEXT: v_or_b32_e32 v4, v4, v39
+; SDAG-NEXT: v_or_b32_e32 v5, v13, v5
+; SDAG-NEXT: v_or_b32_e32 v11, v15, v11
+; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v34, v2
+; SDAG-NEXT: v_or_b32_e32 v4, v12, v4
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v35, v3, vcc
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v36, v6, vcc
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v7, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8
+; SDAG-NEXT: v_and_b32_e32 v15, v8, v29
+; SDAG-NEXT: v_and_b32_e32 v38, v8, v28
+; SDAG-NEXT: v_and_b32_e32 v39, v8, v0
+; SDAG-NEXT: v_and_b32_e32 v48, v8, v1
; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v15
-; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v48, vcc
-; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v39, vcc
-; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v38, vcc
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v38, vcc
+; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v39, vcc
+; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v48, vcc
; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30
; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc
-; SDAG-NEXT: v_or_b32_e32 v39, v31, v33
; SDAG-NEXT: v_or_b32_e32 v38, v30, v32
+; SDAG-NEXT: v_or_b32_e32 v39, v31, v33
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39]
+; SDAG-NEXT: v_and_b32_e32 v8, 1, v8
; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; SDAG-NEXT: v_or_b32_e32 v6, v14, v6
-; SDAG-NEXT: v_mov_b32_e32 v15, v11
-; SDAG-NEXT: v_mov_b32_e32 v14, v10
+; SDAG-NEXT: v_or_b32_e32 v10, v14, v10
+; SDAG-NEXT: v_mov_b32_e32 v15, v9
+; SDAG-NEXT: v_mov_b32_e32 v14, v8
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
; SDAG-NEXT: s_cbranch_execnz .LBB0_9
; SDAG-NEXT: ; %bb.10: ; %Flow
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
; SDAG-NEXT: .LBB0_11: ; %Flow11
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v7
-; SDAG-NEXT: v_lshl_b64 v[2:3], v[6:7], 1
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[4:5], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v11
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[10:11], 1
; SDAG-NEXT: v_or_b32_e32 v0, v0, v4
; SDAG-NEXT: v_or_b32_e32 v13, v13, v1
-; SDAG-NEXT: v_or_b32_e32 v14, v11, v3
-; SDAG-NEXT: v_or_b32_e32 v11, v12, v0
-; SDAG-NEXT: v_or_b32_e32 v10, v10, v2
+; SDAG-NEXT: v_or_b32_e32 v14, v9, v3
+; SDAG-NEXT: v_or_b32_e32 v9, v12, v0
+; SDAG-NEXT: v_or_b32_e32 v8, v8, v2
; SDAG-NEXT: .LBB0_12: ; %Flow12
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
-; SDAG-NEXT: v_xor_b32_e32 v3, v29, v28
-; SDAG-NEXT: v_xor_b32_e32 v2, v27, v26
-; SDAG-NEXT: v_xor_b32_e32 v7, v19, v18
+; SDAG-NEXT: v_xor_b32_e32 v3, v27, v26
+; SDAG-NEXT: v_xor_b32_e32 v2, v25, v24
+; SDAG-NEXT: v_xor_b32_e32 v7, v21, v20
; SDAG-NEXT: v_xor_b32_e32 v6, v17, v16
-; SDAG-NEXT: v_xor_b32_e32 v4, v20, v3
-; SDAG-NEXT: v_xor_b32_e32 v5, v21, v2
-; SDAG-NEXT: v_xor_b32_e32 v1, v22, v3
+; SDAG-NEXT: v_xor_b32_e32 v4, v18, v3
+; SDAG-NEXT: v_xor_b32_e32 v5, v22, v2
+; SDAG-NEXT: v_xor_b32_e32 v1, v19, v3
; SDAG-NEXT: v_xor_b32_e32 v0, v23, v2
-; SDAG-NEXT: v_xor_b32_e32 v8, v13, v7
-; SDAG-NEXT: v_xor_b32_e32 v9, v11, v6
+; SDAG-NEXT: v_xor_b32_e32 v10, v13, v7
+; SDAG-NEXT: v_xor_b32_e32 v9, v9, v6
; SDAG-NEXT: v_xor_b32_e32 v11, v14, v7
; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v5, v2, vcc
; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc
-; SDAG-NEXT: v_xor_b32_e32 v4, v10, v6
+; SDAG-NEXT: v_xor_b32_e32 v4, v8, v6
; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v6
; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v11, v7, vcc
; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v9, v6, vcc
-; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v8, v7, vcc
+; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v10, v7, vcc
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: v_sdiv_v2i128_vv:
; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: v_ashrrev_i32_e32 v24, 31, v3
; GISEL-NEXT: v_ashrrev_i32_e32 v25, 31, v11
-; GISEL-NEXT: v_mov_b32_e32 v20, 0x7f
-; GISEL-NEXT: v_mov_b32_e32 v21, 0
+; GISEL-NEXT: v_mov_b32_e32 v16, 0x7f
+; GISEL-NEXT: v_mov_b32_e32 v17, 0
+; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: v_xor_b32_e32 v0, v24, v0
; GISEL-NEXT: v_xor_b32_e32 v1, v24, v1
; GISEL-NEXT: v_xor_b32_e32 v2, v24, v2
@@ -438,71 +438,71 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_xor_b32_e32 v9, v25, v9
; GISEL-NEXT: v_xor_b32_e32 v10, v25, v10
; GISEL-NEXT: v_xor_b32_e32 v11, v25, v11
-; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v0, v24
-; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v1, v24, vcc
+; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v0, v24
+; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v1, v24, vcc
; GISEL-NEXT: v_sub_i32_e64 v26, s[4:5], v8, v25
; GISEL-NEXT: v_subb_u32_e64 v27, s[4:5], v9, v25, s[4:5]
-; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v2, v24, vcc
-; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v3, v24, vcc
+; GISEL-NEXT: v_subb_u32_e32 v20, vcc, v2, v24, vcc
+; GISEL-NEXT: v_subb_u32_e32 v21, vcc, v3, v24, vcc
; GISEL-NEXT: v_subb_u32_e64 v10, vcc, v10, v25, s[4:5]
; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v11, v25, vcc
; GISEL-NEXT: v_ffbh_u32_e32 v8, v27
; GISEL-NEXT: v_ffbh_u32_e32 v9, v26
-; GISEL-NEXT: v_ffbh_u32_e32 v22, v17
-; GISEL-NEXT: v_ffbh_u32_e32 v23, v16
+; GISEL-NEXT: v_ffbh_u32_e32 v22, v18
+; GISEL-NEXT: v_ffbh_u32_e32 v23, v19
; GISEL-NEXT: v_or_b32_e32 v0, v26, v10
; GISEL-NEXT: v_or_b32_e32 v1, v27, v11
-; GISEL-NEXT: v_or_b32_e32 v2, v16, v18
-; GISEL-NEXT: v_or_b32_e32 v3, v17, v19
+; GISEL-NEXT: v_or_b32_e32 v2, v18, v20
+; GISEL-NEXT: v_or_b32_e32 v3, v19, v21
; GISEL-NEXT: v_add_i32_e32 v9, vcc, 32, v9
-; GISEL-NEXT: v_ffbh_u32_e32 v28, v11
-; GISEL-NEXT: v_ffbh_u32_e32 v29, v10
-; GISEL-NEXT: v_add_i32_e32 v23, vcc, 32, v23
-; GISEL-NEXT: v_ffbh_u32_e32 v30, v19
-; GISEL-NEXT: v_ffbh_u32_e32 v31, v18
+; GISEL-NEXT: v_add_i32_e32 v22, vcc, 32, v22
+; GISEL-NEXT: v_ffbh_u32_e32 v28, v10
+; GISEL-NEXT: v_ffbh_u32_e32 v29, v11
+; GISEL-NEXT: v_ffbh_u32_e32 v30, v20
+; GISEL-NEXT: v_ffbh_u32_e32 v31, v21
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
; GISEL-NEXT: v_min_u32_e32 v0, v8, v9
-; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v29
-; GISEL-NEXT: v_min_u32_e32 v2, v22, v23
-; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v31
+; GISEL-NEXT: v_min_u32_e32 v1, v23, v22
+; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 32, v28
+; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v30
+; GISEL-NEXT: v_min_u32_e32 v2, v29, v2
+; GISEL-NEXT: v_min_u32_e32 v3, v31, v3
; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0
-; GISEL-NEXT: v_min_u32_e32 v1, v28, v1
-; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 64, v2
-; GISEL-NEXT: v_min_u32_e32 v3, v30, v3
+; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 64, v1
; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21]
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v0, v1
; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[20:21]
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v2
-; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1]
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
+; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[16:17]
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; GISEL-NEXT: v_or_b32_e32 v8, v8, v0
; GISEL-NEXT: v_or_b32_e32 v9, v3, v1
+; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GISEL-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_or_b32_e32 v9, v22, v20
-; GISEL-NEXT: v_and_b32_e32 v20, 1, v9
+; GISEL-NEXT: v_or_b32_e32 v9, v22, v16
; GISEL-NEXT: v_or_b32_e32 v8, v9, v8
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v20, v16, 0, vcc
-; GISEL-NEXT: v_and_b32_e32 v22, 1, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v21, v17, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v8, v18, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v9, v19, 0, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22
-; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
-; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
+; GISEL-NEXT: v_and_b32_e32 v9, 1, v9
+; GISEL-NEXT: v_and_b32_e32 v8, 1, v8
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v22, v18, 0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, v20, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v9, v21, 0, vcc
+; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
+; GISEL-NEXT: v_cndmask_b32_e64 v23, v19, 0, vcc
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GISEL-NEXT: s_cbranch_execz .LBB0_6
; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
; GISEL-NEXT: v_add_i32_e32 v28, vcc, 1, v2
@@ -511,110 +511,111 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_not_b32_e32 v2, 63
; GISEL-NEXT: v_addc_u32_e64 v30, vcc, 0, v0, s[4:5]
; GISEL-NEXT: v_addc_u32_e32 v31, vcc, 0, v1, vcc
-; GISEL-NEXT: v_add_i32_e64 v20, s[4:5], v32, v2
+; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v32, v2
; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 64, v32
-; GISEL-NEXT: v_lshl_b64 v[0:1], v[16:17], v32
-; GISEL-NEXT: v_lshl_b64 v[2:3], v[18:19], v32
+; GISEL-NEXT: v_lshl_b64 v[0:1], v[18:19], v32
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], v32
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
-; GISEL-NEXT: v_lshr_b64 v[8:9], v[16:17], v8
-; GISEL-NEXT: v_lshl_b64 v[22:23], v[16:17], v20
+; GISEL-NEXT: v_lshr_b64 v[8:9], v[18:19], v8
+; GISEL-NEXT: v_lshl_b64 v[22:23], v[18:19], v16
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v32
-; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v1, vcc
; GISEL-NEXT: v_or_b32_e32 v0, v8, v2
; GISEL-NEXT: v_or_b32_e32 v1, v9, v3
; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v0, v18, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v9, v1, v19, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v0, v20, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v9, v1, v21, vcc
; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_mov_b32_e32 v0, s8
; GISEL-NEXT: v_mov_b32_e32 v1, s9
; GISEL-NEXT: v_mov_b32_e32 v2, s10
; GISEL-NEXT: v_mov_b32_e32 v3, s11
-; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7]
+; GISEL-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[8:9]
; GISEL-NEXT: s_cbranch_execz .LBB0_5
; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4
-; GISEL-NEXT: v_add_i32_e32 v34, vcc, 0xffffffc0, v28
+; GISEL-NEXT: v_add_i32_e32 v32, vcc, 0xffffffc0, v28
; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v28
-; GISEL-NEXT: v_lshr_b64 v[0:1], v[18:19], v28
-; GISEL-NEXT: v_lshr_b64 v[2:3], v[16:17], v28
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v28
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v28
+; GISEL-NEXT: v_lshr_b64 v[0:1], v[20:21], v28
+; GISEL-NEXT: v_lshr_b64 v[2:3], v[18:19], v28
+; GISEL-NEXT: v_lshl_b64 v[22:23], v[20:21], v22
+; GISEL-NEXT: v_or_b32_e32 v22, v2, v22
+; GISEL-NEXT: v_or_b32_e32 v23, v3, v23
+; GISEL-NEXT: s_mov_b64 s[8:9], 0
+; GISEL-NEXT: v_lshr_b64 v[2:3], v[20:21], v32
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v22, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v23, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v18, v2, v18, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v19, v3, v19, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc
; GISEL-NEXT: v_add_i32_e32 v32, vcc, -1, v26
-; GISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v28
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v28
; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v27, vcc
-; GISEL-NEXT: v_lshl_b64 v[22:23], v[18:19], v22
-; GISEL-NEXT: v_lshr_b64 v[36:37], v[18:19], v34
-; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, v0, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5]
; GISEL-NEXT: v_addc_u32_e32 v34, vcc, -1, v10, vcc
-; GISEL-NEXT: v_or_b32_e32 v0, v2, v22
-; GISEL-NEXT: v_or_b32_e32 v1, v3, v23
; GISEL-NEXT: v_addc_u32_e32 v35, vcc, -1, v11, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v0, v36, v0, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v1, v37, v1, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v22, v0, v16, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v23, v1, v17, s[6:7]
-; GISEL-NEXT: v_mov_b32_e32 v17, 0
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_mov_b32_e32 v23, 0
; GISEL-NEXT: v_mov_b32_e32 v0, s8
; GISEL-NEXT: v_mov_b32_e32 v1, s9
; GISEL-NEXT: v_mov_b32_e32 v2, s10
; GISEL-NEXT: v_mov_b32_e32 v3, s11
; GISEL-NEXT: .LBB0_3: ; %udiv-do-while3
; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; GISEL-NEXT: v_lshrrev_b32_e32 v16, 31, v21
+; GISEL-NEXT: v_lshrrev_b32_e32 v36, 31, v17
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[16:17], 1
+; GISEL-NEXT: v_or_b32_e32 v16, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v17, v1, v3
+; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v19
+; GISEL-NEXT: v_lshl_b64 v[0:1], v[18:19], 1
; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1
-; GISEL-NEXT: v_lshl_b64 v[36:37], v[22:23], 1
-; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
-; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v23
-; GISEL-NEXT: v_lshrrev_b32_e32 v23, 31, v9
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v22
+; GISEL-NEXT: v_lshrrev_b32_e32 v18, 31, v9
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v18
+; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v32, v0
+; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v33, v1, vcc
+; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v34, v2, vcc
+; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v35, v3, vcc
+; GISEL-NEXT: v_ashrrev_i32_e32 v22, 31, v18
+; GISEL-NEXT: v_and_b32_e32 v18, v22, v26
+; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v0, v18
+; GISEL-NEXT: v_and_b32_e32 v0, v22, v27
+; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v1, v0, vcc
+; GISEL-NEXT: v_and_b32_e32 v0, v22, v10
+; GISEL-NEXT: v_subb_u32_e32 v20, vcc, v2, v0, vcc
+; GISEL-NEXT: v_and_b32_e32 v0, v22, v11
+; GISEL-NEXT: v_subb_u32_e32 v21, vcc, v3, v0, vcc
; GISEL-NEXT: v_add_i32_e32 v28, vcc, -1, v28
; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc
-; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
-; GISEL-NEXT: v_or_b32_e32 v20, v0, v2
-; GISEL-NEXT: v_or_b32_e32 v21, v1, v3
-; GISEL-NEXT: v_or_b32_e32 v2, v18, v22
-; GISEL-NEXT: v_or_b32_e32 v3, v36, v23
; GISEL-NEXT: v_addc_u32_e32 v30, vcc, -1, v30, vcc
; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc
-; GISEL-NEXT: v_or_b32_e32 v8, v8, v16
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v32, v3
-; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v33, v37, vcc
; GISEL-NEXT: v_or_b32_e32 v0, v28, v30
; GISEL-NEXT: v_or_b32_e32 v1, v29, v31
-; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v34, v2, vcc
-; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v35, v19, vcc
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v16
+; GISEL-NEXT: v_and_b32_e32 v22, 1, v22
+; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
; GISEL-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GISEL-NEXT: v_and_b32_e32 v1, v0, v26
-; GISEL-NEXT: v_and_b32_e32 v18, v0, v27
-; GISEL-NEXT: v_and_b32_e32 v16, 1, v0
-; GISEL-NEXT: v_and_b32_e32 v36, v0, v10
-; GISEL-NEXT: v_and_b32_e32 v0, v0, v11
-; GISEL-NEXT: v_sub_i32_e32 v22, vcc, v3, v1
-; GISEL-NEXT: v_subb_u32_e32 v23, vcc, v37, v18, vcc
-; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v2, v36, vcc
-; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v19, v0, vcc
-; GISEL-NEXT: v_mov_b32_e32 v0, v16
-; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_or_b32_e32 v8, v8, v36
+; GISEL-NEXT: v_mov_b32_e32 v0, v22
+; GISEL-NEXT: v_mov_b32_e32 v1, v23
; GISEL-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GISEL-NEXT: s_cbranch_execnz .LBB0_3
; GISEL-NEXT: ; %bb.4: ; %Flow13
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: .LBB0_5: ; %Flow14
-; GISEL-NEXT: s_or_b64 exec, exec, s[14:15]
-; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[16:17], 1
; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
-; GISEL-NEXT: v_lshrrev_b32_e32 v10, 31, v21
+; GISEL-NEXT: v_lshrrev_b32_e32 v10, 31, v17
; GISEL-NEXT: v_or_b32_e32 v8, v8, v10
-; GISEL-NEXT: v_or_b32_e32 v20, v0, v2
-; GISEL-NEXT: v_or_b32_e32 v21, v1, v3
+; GISEL-NEXT: v_or_b32_e32 v22, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v23, v1, v3
; GISEL-NEXT: .LBB0_6: ; %Flow16
-; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: v_ashrrev_i32_e32 v18, 31, v7
; GISEL-NEXT: v_ashrrev_i32_e32 v19, 31, v15
@@ -630,18 +631,18 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_xor_b32_e32 v15, v19, v15
; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v0, v18
; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v1, v18, vcc
-; GISEL-NEXT: v_sub_i32_e64 v22, s[4:5], v4, v19
-; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], v5, v19, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], v4, v19
+; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], v5, v19, s[4:5]
; GISEL-NEXT: v_subb_u32_e32 v12, vcc, v2, v18, vcc
; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v18, vcc
; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v14, v19, s[4:5]
; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v15, v19, vcc
-; GISEL-NEXT: v_ffbh_u32_e32 v14, v23
-; GISEL-NEXT: v_ffbh_u32_e32 v15, v22
+; GISEL-NEXT: v_ffbh_u32_e32 v14, v21
+; GISEL-NEXT: v_ffbh_u32_e32 v15, v20
; GISEL-NEXT: v_ffbh_u32_e32 v16, v7
; GISEL-NEXT: v_ffbh_u32_e32 v17, v6
-; GISEL-NEXT: v_or_b32_e32 v0, v22, v4
-; GISEL-NEXT: v_or_b32_e32 v1, v23, v5
+; GISEL-NEXT: v_or_b32_e32 v0, v20, v4
+; GISEL-NEXT: v_or_b32_e32 v1, v21, v5
; GISEL-NEXT: v_or_b32_e32 v2, v6, v12
; GISEL-NEXT: v_or_b32_e32 v3, v7, v13
; GISEL-NEXT: v_add_i32_e32 v15, vcc, 32, v15
@@ -732,8 +733,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_lshr_b64 v[0:1], v[12:13], v26
; GISEL-NEXT: v_lshr_b64 v[2:3], v[6:7], v26
; GISEL-NEXT: s_mov_b64 s[4:5], 0
-; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v22
-; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v23, vcc
+; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v20
+; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v21, vcc
; GISEL-NEXT: v_lshl_b64 v[16:17], v[12:13], v16
; GISEL-NEXT: v_lshr_b64 v[12:13], v[12:13], v32
; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v4, vcc
@@ -782,8 +783,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v6
; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GISEL-NEXT: v_and_b32_e32 v6, 1, v0
-; GISEL-NEXT: v_and_b32_e32 v12, v0, v22
-; GISEL-NEXT: v_and_b32_e32 v13, v0, v23
+; GISEL-NEXT: v_and_b32_e32 v12, v0, v20
+; GISEL-NEXT: v_and_b32_e32 v13, v0, v21
; GISEL-NEXT: v_and_b32_e32 v34, v0, v4
; GISEL-NEXT: v_and_b32_e32 v35, v0, v5
; GISEL-NEXT: v_mov_b32_e32 v0, v6
@@ -808,8 +809,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: v_xor_b32_e32 v3, v25, v24
; GISEL-NEXT: v_xor_b32_e32 v7, v19, v18
-; GISEL-NEXT: v_xor_b32_e32 v0, v20, v3
-; GISEL-NEXT: v_xor_b32_e32 v1, v21, v3
+; GISEL-NEXT: v_xor_b32_e32 v0, v22, v3
+; GISEL-NEXT: v_xor_b32_e32 v1, v23, v3
; GISEL-NEXT: v_xor_b32_e32 v2, v8, v3
; GISEL-NEXT: v_xor_b32_e32 v6, v9, v3
; GISEL-NEXT: v_xor_b32_e32 v4, v14, v7
@@ -853,11 +854,11 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22
; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24
; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26
-; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
; SDAG-NEXT: v_min_u32_e32 v16, v16, v21
; SDAG-NEXT: v_min_u32_e32 v17, v17, v23
; SDAG-NEXT: v_min_u32_e32 v18, v18, v25
; SDAG-NEXT: v_min_u32_e32 v19, v19, v27
+; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17
; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc
; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19
@@ -868,146 +869,146 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc
-; SDAG-NEXT: v_sub_i32_e32 v23, vcc, v16, v18
-; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v20, v17, vcc
-; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v23
-; SDAG-NEXT: v_subbrev_u32_e32 v25, vcc, 0, v28, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[23:24]
+; SDAG-NEXT: v_sub_i32_e32 v22, vcc, v16, v18
+; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v20, v17, vcc
+; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v22
+; SDAG-NEXT: v_subbrev_u32_e32 v24, vcc, 0, v28, vcc
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[22:23]
; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; SDAG-NEXT: v_subbrev_u32_e32 v26, vcc, 0, v28, vcc
-; SDAG-NEXT: v_or_b32_e32 v16, v16, v25
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[25:26]
+; SDAG-NEXT: v_subbrev_u32_e32 v25, vcc, 0, v28, vcc
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v24
+; SDAG-NEXT: v_or_b32_e32 v17, v23, v25
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[24:25]
; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; SDAG-NEXT: v_or_b32_e32 v17, v24, v26
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[25:26]
-; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT: v_and_b32_e32 v16, 1, v18
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[24:25]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v19, v18, s[4:5]
+; SDAG-NEXT: v_and_b32_e32 v16, 1, v16
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v16, v3, 0, s[4:5]
; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; SDAG-NEXT: v_cndmask_b32_e64 v17, v2, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v18, v1, 0, s[4:5]
+; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc
; SDAG-NEXT: v_cndmask_b32_e64 v19, v0, 0, s[4:5]
-; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
-; SDAG-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9]
; SDAG-NEXT: s_cbranch_execz .LBB1_6
; SDAG-NEXT: ; %bb.1: ; %udiv-bb15
-; SDAG-NEXT: v_add_i32_e32 v18, vcc, 1, v23
-; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v23
+; SDAG-NEXT: v_add_i32_e32 v26, vcc, 1, v22
+; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v22
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: v_mov_b32_e32 v21, 0
-; SDAG-NEXT: v_mov_b32_e32 v22, 0
-; SDAG-NEXT: v_addc_u32_e32 v27, vcc, 0, v24, vcc
+; SDAG-NEXT: v_addc_u32_e32 v27, vcc, 0, v23, vcc
; SDAG-NEXT: v_lshl_b64 v[16:17], v[0:1], v16
-; SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v25, vcc
-; SDAG-NEXT: v_addc_u32_e32 v29, vcc, 0, v26, vcc
-; SDAG-NEXT: v_or_b32_e32 v19, v18, v28
-; SDAG-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v23
-; SDAG-NEXT: v_or_b32_e32 v20, v27, v29
-; SDAG-NEXT: v_lshl_b64 v[23:24], v[2:3], v30
+; SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v24, vcc
+; SDAG-NEXT: v_addc_u32_e32 v29, vcc, 0, v25, vcc
+; SDAG-NEXT: v_or_b32_e32 v18, v26, v28
+; SDAG-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v22
+; SDAG-NEXT: v_or_b32_e32 v19, v27, v29
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v30
; SDAG-NEXT: v_sub_i32_e32 v31, vcc, 64, v30
-; SDAG-NEXT: v_lshl_b64 v[25:26], v[0:1], v30
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[19:20]
-; SDAG-NEXT: v_lshr_b64 v[19:20], v[0:1], v31
-; SDAG-NEXT: v_or_b32_e32 v20, v24, v20
+; SDAG-NEXT: v_lshl_b64 v[24:25], v[0:1], v30
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_lshr_b64 v[18:19], v[0:1], v31
; SDAG-NEXT: v_or_b32_e32 v19, v23, v19
+; SDAG-NEXT: v_or_b32_e32 v18, v22, v18
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30
-; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v19, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v26, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v25, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v17, v19, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v16, v18, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v25, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v24, s[4:5]
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30
-; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v3, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v2, s[4:5]
-; SDAG-NEXT: v_mov_b32_e32 v19, 0
-; SDAG-NEXT: v_mov_b32_e32 v20, 0
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v19, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v2, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v22, 0
+; SDAG-NEXT: v_mov_b32_e32 v23, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB1_5
; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4
-; SDAG-NEXT: v_lshr_b64 v[21:22], v[0:1], v18
-; SDAG-NEXT: v_sub_i32_e32 v31, vcc, 64, v18
-; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v18
-; SDAG-NEXT: v_lshr_b64 v[32:33], v[2:3], v18
+; SDAG-NEXT: v_lshr_b64 v[20:21], v[0:1], v26
+; SDAG-NEXT: v_sub_i32_e32 v22, vcc, 64, v26
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v22
+; SDAG-NEXT: v_or_b32_e32 v23, v21, v23
+; SDAG-NEXT: v_or_b32_e32 v22, v20, v22
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26
+; SDAG-NEXT: v_subrev_i32_e64 v20, s[4:5], 64, v26
+; SDAG-NEXT: v_lshr_b64 v[20:21], v[2:3], v20
+; SDAG-NEXT: v_cndmask_b32_e32 v21, v21, v23, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v21, v1, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v20, v0, s[4:5]
+; SDAG-NEXT: v_lshr_b64 v[2:3], v[2:3], v26
+; SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v8
-; SDAG-NEXT: s_mov_b64 s[12:13], 0
-; SDAG-NEXT: v_mov_b32_e32 v25, 0
-; SDAG-NEXT: v_mov_b32_e32 v26, 0
-; SDAG-NEXT: v_mov_b32_e32 v19, 0
-; SDAG-NEXT: v_mov_b32_e32 v20, 0
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v18
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v18
-; SDAG-NEXT: v_lshl_b64 v[34:35], v[2:3], v31
-; SDAG-NEXT: v_lshr_b64 v[36:37], v[2:3], v36
; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v9, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v33, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v32, s[4:5]
-; SDAG-NEXT: v_or_b32_e32 v22, v22, v35
-; SDAG-NEXT: v_or_b32_e32 v21, v21, v34
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v10, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v22, v37, v22, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v21, v36, v21, s[4:5]
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v11, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v1, v22, v1, s[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v0, v21, v0, s[6:7]
+; SDAG-NEXT: s_mov_b64 s[4:5], 0
+; SDAG-NEXT: v_mov_b32_e32 v24, 0
+; SDAG-NEXT: v_mov_b32_e32 v25, 0
; SDAG-NEXT: v_mov_b32_e32 v22, 0
+; SDAG-NEXT: v_mov_b32_e32 v23, 0
+; SDAG-NEXT: v_mov_b32_e32 v21, 0
; SDAG-NEXT: .LBB1_3: ; %udiv-do-while3
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; SDAG-NEXT: v_lshrrev_b32_e32 v21, 31, v24
-; SDAG-NEXT: v_lshl_b64 v[23:24], v[23:24], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v34, 31, v17
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
+; SDAG-NEXT: v_or_b32_e32 v17, v25, v17
+; SDAG-NEXT: v_or_b32_e32 v16, v24, v16
; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v34, 31, v1
+; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v1
; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v35, 31, v17
-; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
-; SDAG-NEXT: v_or_b32_e32 v24, v26, v24
-; SDAG-NEXT: v_or_b32_e32 v23, v25, v23
-; SDAG-NEXT: v_or_b32_e32 v2, v2, v34
-; SDAG-NEXT: v_or_b32_e32 v0, v0, v35
-; SDAG-NEXT: v_or_b32_e32 v16, v16, v21
-; SDAG-NEXT: v_sub_i32_e32 v21, vcc, v30, v0
-; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v31, v1, vcc
-; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v32, v2, vcc
-; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v33, v3, vcc
-; SDAG-NEXT: v_ashrrev_i32_e32 v21, 31, v21
-; SDAG-NEXT: v_and_b32_e32 v25, v21, v8
-; SDAG-NEXT: v_and_b32_e32 v26, v21, v9
-; SDAG-NEXT: v_and_b32_e32 v34, v21, v10
-; SDAG-NEXT: v_and_b32_e32 v35, v21, v11
-; SDAG-NEXT: v_and_b32_e32 v21, 1, v21
-; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v25
-; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v26, vcc
-; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v34, vcc
-; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v35, vcc
-; SDAG-NEXT: v_add_i32_e32 v18, vcc, -1, v18
+; SDAG-NEXT: v_or_b32_e32 v2, v2, v20
+; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v19
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v20
+; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v30, v0
+; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v31, v1, vcc
+; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v32, v2, vcc
+; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v33, v3, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v20, 31, v20
+; SDAG-NEXT: v_and_b32_e32 v24, v20, v8
+; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v24
+; SDAG-NEXT: v_and_b32_e32 v24, v20, v9
+; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v24, vcc
+; SDAG-NEXT: v_and_b32_e32 v24, v20, v10
+; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v24, vcc
+; SDAG-NEXT: v_and_b32_e32 v24, v20, v11
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v24, vcc
+; SDAG-NEXT: v_add_i32_e32 v26, vcc, -1, v26
; SDAG-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc
; SDAG-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc
; SDAG-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc
-; SDAG-NEXT: v_or_b32_e32 v25, v18, v28
-; SDAG-NEXT: v_or_b32_e32 v26, v27, v29
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[25:26]
-; SDAG-NEXT: v_or_b32_e32 v17, v20, v17
-; SDAG-NEXT: s_or_b64 s[12:13], vcc, s[12:13]
-; SDAG-NEXT: v_or_b32_e32 v16, v19, v16
-; SDAG-NEXT: v_mov_b32_e32 v26, v22
+; SDAG-NEXT: v_or_b32_e32 v24, v26, v28
+; SDAG-NEXT: v_or_b32_e32 v25, v27, v29
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[24:25]
+; SDAG-NEXT: v_and_b32_e32 v20, 1, v20
+; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
+; SDAG-NEXT: v_or_b32_e32 v18, v18, v34
+; SDAG-NEXT: v_or_b32_e32 v19, v23, v19
+; SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v18, v22, v18
; SDAG-NEXT: v_mov_b32_e32 v25, v21
-; SDAG-NEXT: s_andn2_b64 exec, exec, s[12:13]
+; SDAG-NEXT: v_mov_b32_e32 v24, v20
+; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5]
; SDAG-NEXT: s_cbranch_execnz .LBB1_3
; SDAG-NEXT: ; %bb.4: ; %Flow13
-; SDAG-NEXT: s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
; SDAG-NEXT: .LBB1_5: ; %Flow14
-; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
-; SDAG-NEXT: v_lshl_b64 v[0:1], v[16:17], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v24
-; SDAG-NEXT: v_lshl_b64 v[2:3], v[23:24], 1
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[18:19], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v17
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[16:17], 1
; SDAG-NEXT: v_or_b32_e32 v0, v0, v8
-; SDAG-NEXT: v_or_b32_e32 v16, v20, v1
-; SDAG-NEXT: v_or_b32_e32 v18, v22, v3
-; SDAG-NEXT: v_or_b32_e32 v17, v19, v0
-; SDAG-NEXT: v_or_b32_e32 v19, v21, v2
+; SDAG-NEXT: v_or_b32_e32 v16, v23, v1
+; SDAG-NEXT: v_or_b32_e32 v18, v21, v3
+; SDAG-NEXT: v_or_b32_e32 v17, v22, v0
+; SDAG-NEXT: v_or_b32_e32 v19, v20, v2
; SDAG-NEXT: .LBB1_6: ; %Flow16
-; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_or_b32_e32 v1, v13, v15
; SDAG-NEXT: v_or_b32_e32 v0, v12, v14
; SDAG-NEXT: v_or_b32_e32 v3, v5, v7
@@ -1045,20 +1046,20 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v8, v1, vcc
-; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v0
-; SDAG-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v24, vcc
+; SDAG-NEXT: v_xor_b32_e32 v2, 0x7f, v0
+; SDAG-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v24, vcc
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[0:1]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; SDAG-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v24, vcc
-; SDAG-NEXT: v_or_b32_e32 v8, v8, v2
+; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; SDAG-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v24, vcc
+; SDAG-NEXT: v_or_b32_e32 v2, v2, v20
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v3, v1, v21
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21]
+; SDAG-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; SDAG-NEXT: v_or_b32_e32 v9, v1, v3
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
-; SDAG-NEXT: v_and_b32_e32 v8, 1, v10
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8
+; SDAG-NEXT: v_and_b32_e32 v2, 1, v8
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v8, v7, 0, s[4:5]
; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
@@ -1069,118 +1070,118 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB1_12
; SDAG-NEXT: ; %bb.7: ; %udiv-bb1
-; SDAG-NEXT: v_add_i32_e32 v8, vcc, 1, v0
-; SDAG-NEXT: v_sub_i32_e64 v9, s[4:5], 63, v0
+; SDAG-NEXT: v_add_i32_e32 v22, vcc, 1, v0
+; SDAG-NEXT: v_sub_i32_e64 v8, s[4:5], 63, v0
+; SDAG-NEXT: v_mov_b32_e32 v2, 0
+; SDAG-NEXT: v_mov_b32_e32 v3, 0
+; SDAG-NEXT: v_addc_u32_e32 v23, vcc, 0, v1, vcc
+; SDAG-NEXT: v_lshl_b64 v[8:9], v[4:5], v8
+; SDAG-NEXT: v_addc_u32_e32 v24, vcc, 0, v20, vcc
+; SDAG-NEXT: v_addc_u32_e32 v25, vcc, 0, v21, vcc
+; SDAG-NEXT: v_or_b32_e32 v10, v22, v24
+; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v0
+; SDAG-NEXT: v_or_b32_e32 v11, v23, v25
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[6:7], v26
+; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v26
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[4:5], v26
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT: v_lshr_b64 v[10:11], v[4:5], v27
+; SDAG-NEXT: v_or_b32_e32 v1, v1, v11
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v10
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, v21, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v20, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: v_mov_b32_e32 v21, 0
-; SDAG-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
-; SDAG-NEXT: v_lshl_b64 v[9:10], v[4:5], v9
-; SDAG-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc
-; SDAG-NEXT: v_addc_u32_e32 v25, vcc, 0, v3, vcc
-; SDAG-NEXT: v_or_b32_e32 v1, v8, v24
-; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0x7f, v0
-; SDAG-NEXT: v_or_b32_e32 v2, v11, v25
-; SDAG-NEXT: v_lshl_b64 v[22:23], v[6:7], v3
-; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 64, v3
-; SDAG-NEXT: v_lshl_b64 v[26:27], v[4:5], v3
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[1:2]
-; SDAG-NEXT: v_lshr_b64 v[0:1], v[4:5], v0
-; SDAG-NEXT: v_or_b32_e32 v1, v23, v1
-; SDAG-NEXT: v_or_b32_e32 v0, v22, v0
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v3
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v1, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v27, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v26, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v2, v7, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v9, v6, s[4:5]
-; SDAG-NEXT: v_mov_b32_e32 v9, 0
-; SDAG-NEXT: v_mov_b32_e32 v10, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB1_11
; SDAG-NEXT: ; %bb.8: ; %udiv-preheader
-; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v8
-; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v8
-; SDAG-NEXT: v_subrev_i32_e32 v28, vcc, 64, v8
-; SDAG-NEXT: v_lshr_b64 v[29:30], v[6:7], v8
+; SDAG-NEXT: v_lshr_b64 v[2:3], v[4:5], v22
+; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v22
+; SDAG-NEXT: v_subrev_i32_e32 v28, vcc, 64, v22
+; SDAG-NEXT: v_lshr_b64 v[29:30], v[6:7], v22
; SDAG-NEXT: v_add_i32_e32 v26, vcc, -1, v12
; SDAG-NEXT: s_mov_b64 s[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v22, 0
-; SDAG-NEXT: v_mov_b32_e32 v23, 0
-; SDAG-NEXT: v_mov_b32_e32 v9, 0
; SDAG-NEXT: v_mov_b32_e32 v10, 0
+; SDAG-NEXT: v_mov_b32_e32 v11, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
+; SDAG-NEXT: v_mov_b32_e32 v21, 0
; SDAG-NEXT: v_lshl_b64 v[31:32], v[6:7], v27
; SDAG-NEXT: v_lshr_b64 v[6:7], v[6:7], v28
; SDAG-NEXT: v_addc_u32_e32 v27, vcc, -1, v13, vcc
-; SDAG-NEXT: v_or_b32_e32 v21, v21, v32
-; SDAG-NEXT: v_or_b32_e32 v20, v20, v31
+; SDAG-NEXT: v_or_b32_e32 v3, v3, v32
+; SDAG-NEXT: v_or_b32_e32 v2, v2, v31
; SDAG-NEXT: v_addc_u32_e32 v28, vcc, -1, v14, vcc
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v8
-; SDAG-NEXT: v_cndmask_b32_e64 v21, v7, v21, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v20, v6, v20, s[4:5]
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v22
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v30, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v29, s[4:5]
; SDAG-NEXT: v_addc_u32_e32 v29, vcc, -1, v15, vcc
-; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
-; SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc
-; SDAG-NEXT: v_mov_b32_e32 v21, 0
+; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22
+; SDAG-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc
+; SDAG-NEXT: v_mov_b32_e32 v3, 0
; SDAG-NEXT: .LBB1_9: ; %udiv-do-while
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v5
+; SDAG-NEXT: v_lshrrev_b32_e32 v2, 31, v5
; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v30, 31, v3
-; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v31, 31, v1
+; SDAG-NEXT: v_lshrrev_b32_e32 v30, 31, v1
; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
-; SDAG-NEXT: v_or_b32_e32 v6, v6, v20
-; SDAG-NEXT: v_or_b32_e32 v4, v4, v30
-; SDAG-NEXT: v_or_b32_e32 v2, v2, v31
-; SDAG-NEXT: v_or_b32_e32 v3, v10, v3
-; SDAG-NEXT: v_or_b32_e32 v1, v23, v1
-; SDAG-NEXT: v_or_b32_e32 v2, v9, v2
-; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v26, v4
-; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v27, v5, vcc
-; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v28, v6, vcc
-; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v29, v7, vcc
-; SDAG-NEXT: v_ashrrev_i32_e32 v23, 31, v20
-; SDAG-NEXT: v_and_b32_e32 v20, 1, v23
-; SDAG-NEXT: v_and_b32_e32 v30, v23, v15
-; SDAG-NEXT: v_and_b32_e32 v31, v23, v14
-; SDAG-NEXT: v_and_b32_e32 v32, v23, v13
-; SDAG-NEXT: v_and_b32_e32 v23, v23, v12
-; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v23
-; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v32, vcc
-; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v31, vcc
-; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v30, vcc
-; SDAG-NEXT: v_add_i32_e32 v8, vcc, -1, v8
-; SDAG-NEXT: v_addc_u32_e32 v11, vcc, -1, v11, vcc
+; SDAG-NEXT: v_lshrrev_b32_e32 v31, 31, v9
+; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; SDAG-NEXT: v_or_b32_e32 v6, v6, v2
+; SDAG-NEXT: v_or_b32_e32 v2, v4, v30
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v31
+; SDAG-NEXT: v_or_b32_e32 v1, v21, v1
+; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v26, v2
+; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v27, v5, vcc
+; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v28, v6, vcc
+; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v29, v7, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v30, 31, v4
+; SDAG-NEXT: v_and_b32_e32 v31, v30, v13
+; SDAG-NEXT: v_and_b32_e32 v4, v30, v12
+; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v2, v4
+; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v31, vcc
+; SDAG-NEXT: v_or_b32_e32 v9, v11, v9
+; SDAG-NEXT: v_or_b32_e32 v0, v20, v0
+; SDAG-NEXT: v_and_b32_e32 v2, 1, v30
+; SDAG-NEXT: v_and_b32_e32 v11, v30, v15
+; SDAG-NEXT: v_and_b32_e32 v30, v30, v14
+; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v30, vcc
+; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v11, vcc
+; SDAG-NEXT: v_add_i32_e32 v22, vcc, -1, v22
+; SDAG-NEXT: v_addc_u32_e32 v23, vcc, -1, v23, vcc
; SDAG-NEXT: v_addc_u32_e32 v24, vcc, -1, v24, vcc
; SDAG-NEXT: v_addc_u32_e32 v25, vcc, -1, v25, vcc
-; SDAG-NEXT: v_or_b32_e32 v31, v11, v25
-; SDAG-NEXT: v_or_b32_e32 v30, v8, v24
+; SDAG-NEXT: v_or_b32_e32 v31, v23, v25
+; SDAG-NEXT: v_or_b32_e32 v30, v22, v24
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[30:31]
; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; SDAG-NEXT: v_or_b32_e32 v0, v22, v0
-; SDAG-NEXT: v_mov_b32_e32 v23, v21
-; SDAG-NEXT: v_mov_b32_e32 v22, v20
+; SDAG-NEXT: v_or_b32_e32 v8, v10, v8
+; SDAG-NEXT: v_mov_b32_e32 v11, v3
+; SDAG-NEXT: v_mov_b32_e32 v10, v2
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
; SDAG-NEXT: s_cbranch_execnz .LBB1_9
; SDAG-NEXT: ; %bb.10: ; %Flow
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
; SDAG-NEXT: .LBB1_11: ; %Flow11
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v1
; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
-; SDAG-NEXT: v_or_b32_e32 v2, v2, v4
-; SDAG-NEXT: v_or_b32_e32 v8, v10, v3
-; SDAG-NEXT: v_or_b32_e32 v10, v21, v1
-; SDAG-NEXT: v_or_b32_e32 v9, v9, v2
-; SDAG-NEXT: v_or_b32_e32 v11, v20, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v6, 31, v9
+; SDAG-NEXT: v_lshl_b64 v[4:5], v[8:9], 1
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v6
+; SDAG-NEXT: v_or_b32_e32 v8, v21, v1
+; SDAG-NEXT: v_or_b32_e32 v10, v3, v5
+; SDAG-NEXT: v_or_b32_e32 v9, v20, v0
+; SDAG-NEXT: v_or_b32_e32 v11, v2, v4
; SDAG-NEXT: .LBB1_12: ; %Flow12
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_mov_b32_e32 v0, v19
@@ -1198,7 +1199,6 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v16, v2
; GISEL-NEXT: v_mov_b32_e32 v17, v3
-; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: v_or_b32_e32 v2, v8, v10
; GISEL-NEXT: v_or_b32_e32 v3, v9, v11
; GISEL-NEXT: v_or_b32_e32 v18, v0, v16
@@ -1209,20 +1209,21 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_ffbh_u32_e32 v23, v10
; GISEL-NEXT: v_ffbh_u32_e32 v26, v1
; GISEL-NEXT: v_ffbh_u32_e32 v27, v0
-; GISEL-NEXT: v_ffbh_u32_e32 v28, v17
-; GISEL-NEXT: v_ffbh_u32_e32 v29, v16
+; GISEL-NEXT: v_ffbh_u32_e32 v28, v16
+; GISEL-NEXT: v_ffbh_u32_e32 v29, v17
; GISEL-NEXT: v_mov_b32_e32 v24, 0x7f
; GISEL-NEXT: v_mov_b32_e32 v25, 0
+; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 32, v21
; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v23
; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], 32, v27
-; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v29
+; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v28
; GISEL-NEXT: v_min_u32_e32 v2, v20, v2
; GISEL-NEXT: v_min_u32_e32 v3, v22, v3
; GISEL-NEXT: v_min_u32_e32 v18, v26, v18
-; GISEL-NEXT: v_min_u32_e32 v19, v28, v19
+; GISEL-NEXT: v_min_u32_e32 v19, v29, v19
; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GISEL-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e32 v2, vcc, 64, v2
@@ -1235,28 +1236,28 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, 0, s[4:5]
; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v22
; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[22:23], v[24:25]
; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v22
-; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[20:21]
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GISEL-NEXT: v_or_b32_e32 v2, v2, v20
; GISEL-NEXT: v_or_b32_e32 v3, v23, v21
+; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[20:21]
+; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21]
; GISEL-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_or_b32_e32 v3, v26, v18
-; GISEL-NEXT: v_and_b32_e32 v18, 1, v3
; GISEL-NEXT: v_or_b32_e32 v2, v3, v2
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
+; GISEL-NEXT: v_and_b32_e32 v3, 1, v3
+; GISEL-NEXT: v_and_b32_e32 v2, 1, v2
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; GISEL-NEXT: v_cndmask_b32_e64 v18, v0, 0, vcc
-; GISEL-NEXT: v_and_b32_e32 v24, 1, v2
-; GISEL-NEXT: v_cndmask_b32_e64 v19, v1, 0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, v16, 0, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v3, v17, 0, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24
-; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
+; GISEL-NEXT: v_cndmask_b32_e64 v19, v1, 0, vcc
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
; GISEL-NEXT: s_cbranch_execz .LBB1_6
; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
@@ -1560,12 +1561,12 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3
; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
; SDAG-NEXT: v_mov_b32_e32 v19, 0
+; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3
; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
-; SDAG-NEXT: v_mov_b32_e32 v29, v28
; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc
+; SDAG-NEXT: v_mov_b32_e32 v29, v28
; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v2, vcc
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
; SDAG-NEXT: v_cndmask_b32_e64 v17, v1, v17, s[4:5]
@@ -1574,106 +1575,106 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_cndmask_b32_e64 v0, v2, v18, s[4:5]
; SDAG-NEXT: v_ffbh_u32_e32 v18, v16
; SDAG-NEXT: v_ffbh_u32_e32 v20, v17
-; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0, v8
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; SDAG-NEXT: v_or_b32_e32 v2, v16, v0
-; SDAG-NEXT: v_ffbh_u32_e32 v22, v0
; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 32, v18
+; SDAG-NEXT: v_ffbh_u32_e32 v22, v0
; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc
; SDAG-NEXT: v_or_b32_e32 v3, v17, v1
-; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], 32, v22
-; SDAG-NEXT: v_ffbh_u32_e32 v24, v1
; SDAG-NEXT: v_min_u32_e32 v18, v18, v20
-; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v10, vcc
+; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], 32, v22
+; SDAG-NEXT: v_ffbh_u32_e32 v22, v1
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[10:11]
; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v23, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v10, vcc
; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v21, s[4:5]
; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3]
-; SDAG-NEXT: v_min_u32_e32 v3, v22, v24
+; SDAG-NEXT: v_min_u32_e32 v3, v20, v22
; SDAG-NEXT: v_add_i32_e64 v8, s[8:9], 64, v18
-; SDAG-NEXT: v_addc_u32_e64 v9, s[8:9], 0, 0, s[8:9]
-; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v11, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v20, s[4:5]
-; SDAG-NEXT: v_ffbh_u32_e32 v10, v31
-; SDAG-NEXT: v_ffbh_u32_e32 v20, v30
+; SDAG-NEXT: v_addc_u32_e64 v18, s[8:9], 0, 0, s[8:9]
+; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v11, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v9, s[4:5]
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; SDAG-NEXT: v_cndmask_b32_e64 v21, v9, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v22, v8, v3, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v18, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v10, v8, v3, vcc
+; SDAG-NEXT: v_ffbh_u32_e32 v9, v31
+; SDAG-NEXT: v_ffbh_u32_e32 v21, v30
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v20, s[4:5]
; SDAG-NEXT: v_or_b32_e32 v8, v31, v2
-; SDAG-NEXT: v_ffbh_u32_e32 v11, v2
-; SDAG-NEXT: v_add_i32_e32 v10, vcc, 32, v10
+; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v9
+; SDAG-NEXT: v_ffbh_u32_e32 v20, v2
; SDAG-NEXT: v_or_b32_e32 v9, v30, v3
-; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11
-; SDAG-NEXT: v_ffbh_u32_e32 v18, v3
-; SDAG-NEXT: v_min_u32_e32 v10, v10, v20
+; SDAG-NEXT: v_min_u32_e32 v11, v11, v21
+; SDAG-NEXT: v_add_i32_e32 v20, vcc, 32, v20
+; SDAG-NEXT: v_ffbh_u32_e32 v21, v3
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; SDAG-NEXT: v_min_u32_e32 v8, v11, v18
-; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 64, v10
-; SDAG-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_min_u32_e32 v8, v20, v21
+; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 64, v11
+; SDAG-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[4:5]
; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
-; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v8, v22
-; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v10, v21, vcc
-; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v8
+; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v8, v10
+; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc
+; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v10
; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v19, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9]
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v19, vcc
-; SDAG-NEXT: v_or_b32_e32 v10, v10, v18
+; SDAG-NEXT: v_or_b32_e32 v8, v8, v18
+; SDAG-NEXT: v_or_b32_e32 v9, v11, v19
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
-; SDAG-NEXT: v_or_b32_e32 v11, v9, v19
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
-; SDAG-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_and_b32_e32 v10, 1, v20
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v21, v20, s[4:5]
+; SDAG-NEXT: v_and_b32_e32 v8, 1, v8
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v35, v1, 0, s[4:5]
; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v27, v17, 0, s[4:5]
+; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc
; SDAG-NEXT: v_cndmask_b32_e64 v33, v16, 0, s[4:5]
-; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
-; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9]
; SDAG-NEXT: s_cbranch_execz .LBB2_6
; SDAG-NEXT: ; %bb.1: ; %udiv-bb15
-; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v8
-; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v8
-; SDAG-NEXT: v_mov_b32_e32 v10, 0
-; SDAG-NEXT: v_mov_b32_e32 v11, 0
-; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v9, vcc
+; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v10
+; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v10
+; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v9, 0
+; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc
; SDAG-NEXT: v_lshl_b64 v[20:21], v[16:17], v20
; SDAG-NEXT: v_addc_u32_e32 v34, vcc, 0, v18, vcc
; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v19, vcc
; SDAG-NEXT: v_or_b32_e32 v18, v32, v34
-; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v8
+; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v10
; SDAG-NEXT: v_or_b32_e32 v19, v33, v35
-; SDAG-NEXT: v_lshl_b64 v[8:9], v[0:1], v24
+; SDAG-NEXT: v_lshl_b64 v[10:11], v[0:1], v24
; SDAG-NEXT: v_sub_i32_e32 v25, vcc, 64, v24
; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v24
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
; SDAG-NEXT: v_lshr_b64 v[18:19], v[16:17], v25
-; SDAG-NEXT: v_or_b32_e32 v9, v9, v19
-; SDAG-NEXT: v_or_b32_e32 v8, v8, v18
+; SDAG-NEXT: v_or_b32_e32 v11, v11, v19
+; SDAG-NEXT: v_or_b32_e32 v10, v10, v18
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v21, v9, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v8, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v20, v10, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5]
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, v1, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v18, 0
; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB2_5
; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4
-; SDAG-NEXT: v_lshr_b64 v[10:11], v[16:17], v32
+; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v32
; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 64, v32
; SDAG-NEXT: v_subrev_i32_e32 v37, vcc, 64, v32
; SDAG-NEXT: v_lshr_b64 v[24:25], v[0:1], v32
@@ -1686,73 +1687,73 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v26
; SDAG-NEXT: v_lshr_b64 v[48:49], v[0:1], v37
; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v30, vcc
-; SDAG-NEXT: v_or_b32_e32 v11, v11, v27
-; SDAG-NEXT: v_or_b32_e32 v10, v10, v26
+; SDAG-NEXT: v_or_b32_e32 v9, v9, v27
+; SDAG-NEXT: v_or_b32_e32 v8, v8, v26
; SDAG-NEXT: v_addc_u32_e32 v38, vcc, -1, v2, vcc
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v32
-; SDAG-NEXT: v_cndmask_b32_e64 v11, v49, v11, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, v48, v10, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v49, v9, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v48, v8, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v27, 0, v25, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v26, 0, v24, s[4:5]
; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v3, vcc
; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32
-; SDAG-NEXT: v_cndmask_b32_e32 v25, v11, v17, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v24, v10, v16, vcc
-; SDAG-NEXT: v_mov_b32_e32 v11, 0
+; SDAG-NEXT: v_cndmask_b32_e32 v25, v9, v17, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v24, v8, v16, vcc
+; SDAG-NEXT: v_mov_b32_e32 v9, 0
; SDAG-NEXT: .LBB2_3: ; %udiv-do-while3
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v21
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v25
+; SDAG-NEXT: v_lshrrev_b32_e32 v48, 31, v25
; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v48, 31, v9
-; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v49, 31, v21
-; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
-; SDAG-NEXT: v_or_b32_e32 v26, v26, v10
-; SDAG-NEXT: v_or_b32_e32 v24, v24, v48
-; SDAG-NEXT: v_or_b32_e32 v8, v8, v49
-; SDAG-NEXT: v_or_b32_e32 v9, v19, v9
-; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v36, v24
-; SDAG-NEXT: v_or_b32_e32 v8, v18, v8
-; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v37, v25, vcc
-; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v38, v26, vcc
-; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v39, v27, vcc
-; SDAG-NEXT: v_ashrrev_i32_e32 v10, 31, v10
-; SDAG-NEXT: v_and_b32_e32 v48, v10, v31
-; SDAG-NEXT: v_and_b32_e32 v49, v10, v30
-; SDAG-NEXT: v_and_b32_e32 v50, v10, v2
-; SDAG-NEXT: v_and_b32_e32 v51, v10, v3
-; SDAG-NEXT: v_and_b32_e32 v10, 1, v10
-; SDAG-NEXT: v_sub_i32_e32 v24, vcc, v24, v48
-; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v49, vcc
-; SDAG-NEXT: v_subb_u32_e32 v26, vcc, v26, v50, vcc
-; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v51, vcc
+; SDAG-NEXT: v_lshrrev_b32_e32 v49, 31, v11
+; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT: v_or_b32_e32 v21, v23, v21
+; SDAG-NEXT: v_or_b32_e32 v20, v22, v20
+; SDAG-NEXT: v_or_b32_e32 v22, v26, v48
+; SDAG-NEXT: v_or_b32_e32 v23, v24, v49
+; SDAG-NEXT: v_or_b32_e32 v10, v10, v8
+; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v36, v23
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v25, vcc
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v38, v22, vcc
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v39, v27, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8
+; SDAG-NEXT: v_and_b32_e32 v24, v8, v31
+; SDAG-NEXT: v_and_b32_e32 v26, v8, v30
+; SDAG-NEXT: v_and_b32_e32 v48, v8, v2
+; SDAG-NEXT: v_and_b32_e32 v49, v8, v3
+; SDAG-NEXT: v_and_b32_e32 v8, 1, v8
+; SDAG-NEXT: v_sub_i32_e32 v24, vcc, v23, v24
+; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v26, vcc
+; SDAG-NEXT: v_subb_u32_e32 v26, vcc, v22, v48, vcc
+; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v49, vcc
; SDAG-NEXT: v_add_i32_e32 v32, vcc, -1, v32
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc
; SDAG-NEXT: v_addc_u32_e32 v34, vcc, -1, v34, vcc
; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc
-; SDAG-NEXT: v_or_b32_e32 v48, v32, v34
-; SDAG-NEXT: v_or_b32_e32 v49, v33, v35
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[48:49]
-; SDAG-NEXT: v_or_b32_e32 v21, v23, v21
+; SDAG-NEXT: v_or_b32_e32 v22, v32, v34
+; SDAG-NEXT: v_or_b32_e32 v23, v33, v35
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23]
+; SDAG-NEXT: v_or_b32_e32 v11, v19, v11
; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; SDAG-NEXT: v_or_b32_e32 v20, v22, v20
-; SDAG-NEXT: v_mov_b32_e32 v23, v11
-; SDAG-NEXT: v_mov_b32_e32 v22, v10
+; SDAG-NEXT: v_or_b32_e32 v10, v18, v10
+; SDAG-NEXT: v_mov_b32_e32 v23, v9
+; SDAG-NEXT: v_mov_b32_e32 v22, v8
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
; SDAG-NEXT: s_cbranch_execnz .LBB2_3
; SDAG-NEXT: ; %bb.4: ; %Flow13
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
; SDAG-NEXT: .LBB2_5: ; %Flow14
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v21
+; SDAG-NEXT: v_or_b32_e32 v10, v10, v22
; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
-; SDAG-NEXT: v_or_b32_e32 v8, v8, v22
-; SDAG-NEXT: v_or_b32_e32 v35, v19, v9
-; SDAG-NEXT: v_or_b32_e32 v27, v11, v21
-; SDAG-NEXT: v_or_b32_e32 v32, v18, v8
-; SDAG-NEXT: v_or_b32_e32 v33, v10, v20
+; SDAG-NEXT: v_or_b32_e32 v35, v19, v11
+; SDAG-NEXT: v_or_b32_e32 v32, v18, v10
+; SDAG-NEXT: v_or_b32_e32 v27, v9, v21
+; SDAG-NEXT: v_or_b32_e32 v33, v8, v20
; SDAG-NEXT: .LBB2_6: ; %Flow16
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_ashrrev_i32_e32 v26, 31, v7
@@ -2025,28 +2026,28 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-LABEL: v_srem_v2i128_vv:
; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: v_ashrrev_i32_e32 v28, 31, v3
-; GISEL-NEXT: v_ashrrev_i32_e32 v20, 31, v11
-; GISEL-NEXT: v_mov_b32_e32 v18, 0x7f
-; GISEL-NEXT: v_mov_b32_e32 v19, 0
+; GISEL-NEXT: v_ashrrev_i32_e32 v18, 31, v11
+; GISEL-NEXT: v_mov_b32_e32 v19, 0x7f
+; GISEL-NEXT: v_mov_b32_e32 v20, 0
+; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: v_xor_b32_e32 v0, v0, v28
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v28
; GISEL-NEXT: v_xor_b32_e32 v2, v2, v28
; GISEL-NEXT: v_xor_b32_e32 v3, v3, v28
-; GISEL-NEXT: v_xor_b32_e32 v8, v8, v20
-; GISEL-NEXT: v_xor_b32_e32 v9, v9, v20
-; GISEL-NEXT: v_xor_b32_e32 v10, v10, v20
-; GISEL-NEXT: v_xor_b32_e32 v11, v11, v20
+; GISEL-NEXT: v_xor_b32_e32 v8, v8, v18
+; GISEL-NEXT: v_xor_b32_e32 v9, v9, v18
+; GISEL-NEXT: v_xor_b32_e32 v10, v10, v18
+; GISEL-NEXT: v_xor_b32_e32 v11, v11, v18
; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v0, v28
; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v1, v28, vcc
-; GISEL-NEXT: v_sub_i32_e64 v30, s[4:5], v8, v20
-; GISEL-NEXT: v_subb_u32_e64 v29, s[4:5], v9, v20, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v30, s[4:5], v8, v18
+; GISEL-NEXT: v_subb_u32_e64 v29, s[4:5], v9, v18, s[4:5]
; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v2, v28, vcc
; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v3, v28, vcc
-; GISEL-NEXT: v_subb_u32_e64 v10, vcc, v10, v20, s[4:5]
-; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v11, v20, vcc
-; GISEL-NEXT: v_ffbh_u32_e32 v20, v29
+; GISEL-NEXT: v_subb_u32_e64 v10, vcc, v10, v18, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc
+; GISEL-NEXT: v_ffbh_u32_e32 v18, v29
; GISEL-NEXT: v_ffbh_u32_e32 v21, v30
; GISEL-NEXT: v_ffbh_u32_e32 v22, v17
; GISEL-NEXT: v_ffbh_u32_e32 v23, v16
@@ -2055,53 +2056,53 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v2, v16, v8
; GISEL-NEXT: v_or_b32_e32 v3, v17, v9
; GISEL-NEXT: v_add_i32_e32 v21, vcc, 32, v21
-; GISEL-NEXT: v_ffbh_u32_e32 v24, v11
-; GISEL-NEXT: v_ffbh_u32_e32 v25, v10
; GISEL-NEXT: v_add_i32_e32 v23, vcc, 32, v23
-; GISEL-NEXT: v_ffbh_u32_e32 v26, v9
-; GISEL-NEXT: v_ffbh_u32_e32 v27, v8
+; GISEL-NEXT: v_ffbh_u32_e32 v24, v10
+; GISEL-NEXT: v_ffbh_u32_e32 v25, v11
+; GISEL-NEXT: v_ffbh_u32_e32 v26, v8
+; GISEL-NEXT: v_ffbh_u32_e32 v27, v9
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
-; GISEL-NEXT: v_min_u32_e32 v0, v20, v21
-; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v25
-; GISEL-NEXT: v_min_u32_e32 v2, v22, v23
-; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v27
+; GISEL-NEXT: v_min_u32_e32 v0, v18, v21
+; GISEL-NEXT: v_min_u32_e32 v1, v22, v23
+; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 32, v24
+; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v26
+; GISEL-NEXT: v_min_u32_e32 v2, v25, v2
+; GISEL-NEXT: v_min_u32_e32 v3, v27, v3
; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0
-; GISEL-NEXT: v_min_u32_e32 v1, v24, v1
-; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 64, v2
-; GISEL-NEXT: v_min_u32_e32 v3, v26, v3
+; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 64, v1
; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v0, v1
; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19]
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GISEL-NEXT: v_xor_b32_e32 v18, 0x7f, v2
-; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1]
-; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
+; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[19:20]
+; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GISEL-NEXT: v_or_b32_e32 v18, v18, v0
; GISEL-NEXT: v_or_b32_e32 v19, v3, v1
+; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GISEL-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v20, v22, v20, vcc
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_or_b32_e32 v19, v20, v21
-; GISEL-NEXT: v_and_b32_e32 v20, 1, v19
+; GISEL-NEXT: v_or_b32_e32 v19, v21, v20
; GISEL-NEXT: v_or_b32_e32 v18, v19, v18
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT: v_and_b32_e32 v19, 1, v19
+; GISEL-NEXT: v_and_b32_e32 v18, 1, v18
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
; GISEL-NEXT: v_cndmask_b32_e64 v31, v16, 0, vcc
-; GISEL-NEXT: v_and_b32_e32 v20, 1, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v32, v17, 0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v18
; GISEL-NEXT: v_cndmask_b32_e64 v18, v8, 0, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v19, v9, 0, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
-; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
+; GISEL-NEXT: v_cndmask_b32_e64 v32, v17, 0, vcc
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
; GISEL-NEXT: s_cbranch_execz .LBB2_6
; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
@@ -2154,11 +2155,11 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v31
; GISEL-NEXT: v_cndmask_b32_e32 v2, v24, v2, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v25, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v27, 0, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v24, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v25, 0, v1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
-; GISEL-NEXT: v_cndmask_b32_e32 v24, v2, v16, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v25, v3, v17, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v26, v2, v16, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v27, v3, v17, vcc
; GISEL-NEXT: v_mov_b32_e32 v23, 0
; GISEL-NEXT: v_mov_b32_e32 v0, s4
; GISEL-NEXT: v_mov_b32_e32 v1, s5
@@ -2166,40 +2167,40 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v3, s7
; GISEL-NEXT: .LBB2_3: ; %udiv-do-while3
; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT: v_lshrrev_b32_e32 v39, 31, v21
; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1
-; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v21
-; GISEL-NEXT: v_lshl_b64 v[48:49], v[24:25], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v27
; GISEL-NEXT: v_lshl_b64 v[26:27], v[26:27], 1
-; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v25
-; GISEL-NEXT: v_lshrrev_b32_e32 v25, 31, v19
-; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
+; GISEL-NEXT: v_lshl_b64 v[24:25], v[24:25], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v48, 31, v19
; GISEL-NEXT: v_add_i32_e32 v31, vcc, -1, v31
; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc
+; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
; GISEL-NEXT: v_or_b32_e32 v20, v0, v2
; GISEL-NEXT: v_or_b32_e32 v21, v1, v3
-; GISEL-NEXT: v_or_b32_e32 v2, v26, v24
-; GISEL-NEXT: v_or_b32_e32 v3, v48, v25
-; GISEL-NEXT: v_or_b32_e32 v18, v18, v22
+; GISEL-NEXT: v_or_b32_e32 v2, v24, v22
+; GISEL-NEXT: v_or_b32_e32 v3, v26, v48
; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc
; GISEL-NEXT: v_addc_u32_e32 v34, vcc, -1, v34, vcc
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v35, v3
-; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v36, v49, vcc
+; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v36, v27, vcc
; GISEL-NEXT: v_or_b32_e32 v0, v31, v33
; GISEL-NEXT: v_or_b32_e32 v1, v32, v34
; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v37, v2, vcc
-; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v38, v27, vcc
+; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v38, v25, vcc
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v22
; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GISEL-NEXT: v_and_b32_e32 v22, 1, v0
; GISEL-NEXT: v_and_b32_e32 v1, v0, v30
-; GISEL-NEXT: v_and_b32_e32 v25, v0, v29
-; GISEL-NEXT: v_and_b32_e32 v26, v0, v10
-; GISEL-NEXT: v_and_b32_e32 v0, v0, v11
-; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v3, v1
-; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v49, v25, vcc
-; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v2, v26, vcc
-; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v0, vcc
+; GISEL-NEXT: v_and_b32_e32 v24, v0, v29
+; GISEL-NEXT: v_and_b32_e32 v48, v0, v10
+; GISEL-NEXT: v_and_b32_e32 v49, v0, v11
+; GISEL-NEXT: v_and_b32_e32 v22, 1, v0
+; GISEL-NEXT: v_sub_i32_e32 v26, vcc, v3, v1
+; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v24, vcc
+; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v2, v48, vcc
+; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v25, v49, vcc
+; GISEL-NEXT: v_or_b32_e32 v18, v18, v39
; GISEL-NEXT: v_mov_b32_e32 v0, v22
; GISEL-NEXT: v_mov_b32_e32 v1, v23
; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
@@ -2486,11 +2487,11 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22
; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24
; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26
-; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
; SDAG-NEXT: v_min_u32_e32 v16, v16, v21
; SDAG-NEXT: v_min_u32_e32 v17, v17, v23
; SDAG-NEXT: v_min_u32_e32 v18, v18, v25
; SDAG-NEXT: v_min_u32_e32 v19, v19, v27
+; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17
; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc
; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19
@@ -2501,65 +2502,65 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc
-; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v18
-; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc
-; SDAG-NEXT: v_xor_b32_e32 v18, 0x7f, v16
+; SDAG-NEXT: v_sub_i32_e32 v18, vcc, v16, v18
+; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v20, v17, vcc
+; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v18
; SDAG-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v28, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17]
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[18:19]
; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
; SDAG-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v28, vcc
-; SDAG-NEXT: v_or_b32_e32 v18, v18, v20
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v20
+; SDAG-NEXT: v_or_b32_e32 v17, v19, v21
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21]
; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
-; SDAG-NEXT: v_or_b32_e32 v19, v17, v21
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21]
-; SDAG-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
-; SDAG-NEXT: v_and_b32_e32 v18, 1, v22
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[20:21]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v23, v22, s[4:5]
+; SDAG-NEXT: v_and_b32_e32 v16, 1, v16
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v33, v3, 0, s[4:5]
; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; SDAG-NEXT: v_cndmask_b32_e64 v31, v2, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v30, v1, 0, s[4:5]
+; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc
; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5]
-; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
-; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9]
; SDAG-NEXT: s_cbranch_execz .LBB3_6
; SDAG-NEXT: ; %bb.1: ; %udiv-bb15
-; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v16
-; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v16
-; SDAG-NEXT: v_mov_b32_e32 v18, 0
-; SDAG-NEXT: v_mov_b32_e32 v19, 0
-; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v17, vcc
+; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v18
+; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v18
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v17, 0
+; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v19, vcc
; SDAG-NEXT: v_lshl_b64 v[22:23], v[0:1], v22
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v20, vcc
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v21, vcc
-; SDAG-NEXT: v_or_b32_e32 v20, v30, v32
-; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16
-; SDAG-NEXT: v_or_b32_e32 v21, v31, v33
-; SDAG-NEXT: v_lshl_b64 v[16:17], v[2:3], v26
-; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v26
-; SDAG-NEXT: v_lshl_b64 v[24:25], v[0:1], v26
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21]
-; SDAG-NEXT: v_lshr_b64 v[20:21], v[0:1], v27
-; SDAG-NEXT: v_or_b32_e32 v17, v17, v21
-; SDAG-NEXT: v_or_b32_e32 v16, v16, v20
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26
-; SDAG-NEXT: v_cndmask_b32_e64 v17, v23, v17, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v16, v22, v16, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v25, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, v24, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26
-; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v3, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v2, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v19, v30, v32
+; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0x7f, v18
+; SDAG-NEXT: v_or_b32_e32 v20, v31, v33
+; SDAG-NEXT: v_lshl_b64 v[24:25], v[2:3], v21
+; SDAG-NEXT: v_sub_i32_e32 v18, vcc, 64, v21
+; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v21
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[19:20]
+; SDAG-NEXT: v_lshr_b64 v[18:19], v[0:1], v18
+; SDAG-NEXT: v_or_b32_e32 v19, v25, v19
+; SDAG-NEXT: v_or_b32_e32 v18, v24, v18
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v21
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v23, v19, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v18, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v27, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, v26, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v21
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v19, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v2, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: v_mov_b32_e32 v21, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB3_5
; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4
-; SDAG-NEXT: v_lshr_b64 v[18:19], v[0:1], v30
+; SDAG-NEXT: v_lshr_b64 v[16:17], v[0:1], v30
; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 64, v30
; SDAG-NEXT: v_subrev_i32_e32 v35, vcc, 64, v30
; SDAG-NEXT: v_lshr_b64 v[26:27], v[2:3], v30
@@ -2572,73 +2573,73 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshl_b64 v[28:29], v[2:3], v28
; SDAG-NEXT: v_lshr_b64 v[37:38], v[2:3], v35
; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v9, vcc
-; SDAG-NEXT: v_or_b32_e32 v19, v19, v29
-; SDAG-NEXT: v_or_b32_e32 v18, v18, v28
+; SDAG-NEXT: v_or_b32_e32 v17, v17, v29
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v28
; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v10, vcc
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30
-; SDAG-NEXT: v_cndmask_b32_e64 v19, v38, v19, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v37, v18, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v38, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v37, v16, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v29, 0, v27, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v28, 0, v26, s[4:5]
; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v11, vcc
; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30
-; SDAG-NEXT: v_cndmask_b32_e32 v27, v19, v1, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v26, v18, v0, vcc
-; SDAG-NEXT: v_mov_b32_e32 v19, 0
+; SDAG-NEXT: v_cndmask_b32_e32 v27, v17, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v26, v16, v0, vcc
+; SDAG-NEXT: v_mov_b32_e32 v17, 0
; SDAG-NEXT: .LBB3_3: ; %udiv-do-while3
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v23
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1
; SDAG-NEXT: v_lshl_b64 v[28:29], v[28:29], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v18, 31, v27
+; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v27
; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v17
-; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v23
-; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1
-; SDAG-NEXT: v_or_b32_e32 v28, v28, v18
-; SDAG-NEXT: v_or_b32_e32 v26, v26, v38
-; SDAG-NEXT: v_or_b32_e32 v16, v16, v39
-; SDAG-NEXT: v_or_b32_e32 v17, v21, v17
-; SDAG-NEXT: v_sub_i32_e32 v18, vcc, v34, v26
-; SDAG-NEXT: v_or_b32_e32 v16, v20, v16
-; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v35, v27, vcc
-; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v36, v28, vcc
-; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v37, v29, vcc
-; SDAG-NEXT: v_ashrrev_i32_e32 v38, 31, v18
-; SDAG-NEXT: v_and_b32_e32 v39, v38, v8
-; SDAG-NEXT: v_and_b32_e32 v48, v38, v9
-; SDAG-NEXT: v_and_b32_e32 v49, v38, v10
-; SDAG-NEXT: v_and_b32_e32 v18, 1, v38
-; SDAG-NEXT: v_and_b32_e32 v38, v38, v11
-; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v26, v39
-; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v48, vcc
-; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v28, v49, vcc
-; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v38, vcc
+; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v19
+; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
+; SDAG-NEXT: v_or_b32_e32 v23, v25, v23
+; SDAG-NEXT: v_or_b32_e32 v22, v24, v22
+; SDAG-NEXT: v_or_b32_e32 v24, v28, v38
+; SDAG-NEXT: v_or_b32_e32 v25, v26, v39
+; SDAG-NEXT: v_or_b32_e32 v18, v18, v16
+; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v34, v25
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v35, v27, vcc
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v36, v24, vcc
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v37, v29, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v16
+; SDAG-NEXT: v_and_b32_e32 v26, v16, v8
+; SDAG-NEXT: v_and_b32_e32 v28, v16, v9
+; SDAG-NEXT: v_and_b32_e32 v38, v16, v10
+; SDAG-NEXT: v_and_b32_e32 v39, v16, v11
+; SDAG-NEXT: v_and_b32_e32 v16, 1, v16
+; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v25, v26
+; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v28, vcc
+; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v24, v38, vcc
+; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v39, vcc
; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30
; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc
-; SDAG-NEXT: v_or_b32_e32 v38, v30, v32
-; SDAG-NEXT: v_or_b32_e32 v39, v31, v33
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39]
-; SDAG-NEXT: v_or_b32_e32 v23, v25, v23
+; SDAG-NEXT: v_or_b32_e32 v24, v30, v32
+; SDAG-NEXT: v_or_b32_e32 v25, v31, v33
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[24:25]
+; SDAG-NEXT: v_or_b32_e32 v19, v21, v19
; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; SDAG-NEXT: v_or_b32_e32 v22, v24, v22
-; SDAG-NEXT: v_mov_b32_e32 v25, v19
-; SDAG-NEXT: v_mov_b32_e32 v24, v18
+; SDAG-NEXT: v_or_b32_e32 v18, v20, v18
+; SDAG-NEXT: v_mov_b32_e32 v25, v17
+; SDAG-NEXT: v_mov_b32_e32 v24, v16
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
; SDAG-NEXT: s_cbranch_execnz .LBB3_3
; SDAG-NEXT: ; %bb.4: ; %Flow13
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
; SDAG-NEXT: .LBB3_5: ; %Flow14
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
+; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v23
; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1
-; SDAG-NEXT: v_or_b32_e32 v16, v16, v24
-; SDAG-NEXT: v_or_b32_e32 v33, v21, v17
-; SDAG-NEXT: v_or_b32_e32 v30, v19, v23
-; SDAG-NEXT: v_or_b32_e32 v31, v20, v16
-; SDAG-NEXT: v_or_b32_e32 v32, v18, v22
+; SDAG-NEXT: v_or_b32_e32 v18, v18, v24
+; SDAG-NEXT: v_or_b32_e32 v33, v21, v19
+; SDAG-NEXT: v_or_b32_e32 v30, v17, v23
+; SDAG-NEXT: v_or_b32_e32 v31, v20, v18
+; SDAG-NEXT: v_or_b32_e32 v32, v16, v22
; SDAG-NEXT: .LBB3_6: ; %Flow16
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_or_b32_e32 v17, v13, v15
@@ -2678,63 +2679,63 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc
; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v18
; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc
-; SDAG-NEXT: v_xor_b32_e32 v20, 0x7f, v16
-; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v28, vcc
+; SDAG-NEXT: v_xor_b32_e32 v18, 0x7f, v16
+; SDAG-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v28, vcc
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17]
; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
-; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v28, vcc
-; SDAG-NEXT: v_or_b32_e32 v20, v20, v18
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v28, vcc
+; SDAG-NEXT: v_or_b32_e32 v18, v18, v20
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21]
; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
-; SDAG-NEXT: v_or_b32_e32 v21, v17, v19
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_or_b32_e32 v19, v17, v21
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21]
; SDAG-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21]
-; SDAG-NEXT: v_and_b32_e32 v20, 1, v22
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v20
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_and_b32_e32 v18, 1, v22
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v23, v7, 0, s[4:5]
; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; SDAG-NEXT: v_cndmask_b32_e64 v22, v6, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v21, v5, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v20, v4, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v5, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v4, 0, s[4:5]
; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB3_12
; SDAG-NEXT: ; %bb.7: ; %udiv-bb1
; SDAG-NEXT: v_add_i32_e32 v34, vcc, 1, v16
; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v16
-; SDAG-NEXT: v_mov_b32_e32 v20, 0
-; SDAG-NEXT: v_mov_b32_e32 v21, 0
+; SDAG-NEXT: v_mov_b32_e32 v18, 0
+; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v17, vcc
; SDAG-NEXT: v_lshl_b64 v[22:23], v[4:5], v22
-; SDAG-NEXT: v_addc_u32_e32 v36, vcc, 0, v18, vcc
-; SDAG-NEXT: v_addc_u32_e32 v37, vcc, 0, v19, vcc
-; SDAG-NEXT: v_or_b32_e32 v17, v34, v36
-; SDAG-NEXT: v_sub_i32_e32 v19, vcc, 0x7f, v16
-; SDAG-NEXT: v_or_b32_e32 v18, v35, v37
-; SDAG-NEXT: v_lshl_b64 v[24:25], v[6:7], v19
-; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 64, v19
-; SDAG-NEXT: v_lshl_b64 v[26:27], v[4:5], v19
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[17:18]
-; SDAG-NEXT: v_lshr_b64 v[16:17], v[4:5], v16
-; SDAG-NEXT: v_or_b32_e32 v17, v25, v17
-; SDAG-NEXT: v_or_b32_e32 v16, v24, v16
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v19
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v23, v17, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v22, v22, v16, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v27, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v26, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
-; SDAG-NEXT: v_cndmask_b32_e64 v19, v18, v7, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v6, s[4:5]
+; SDAG-NEXT: v_addc_u32_e32 v36, vcc, 0, v20, vcc
+; SDAG-NEXT: v_addc_u32_e32 v37, vcc, 0, v21, vcc
+; SDAG-NEXT: v_or_b32_e32 v20, v34, v36
+; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16
+; SDAG-NEXT: v_or_b32_e32 v21, v35, v37
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[6:7], v26
+; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v26
+; SDAG-NEXT: v_lshl_b64 v[24:25], v[4:5], v26
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21]
+; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v27
+; SDAG-NEXT: v_or_b32_e32 v17, v17, v21
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v20
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v23, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v22, v16, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v25, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v24, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v7, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v6, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v22, 0
; SDAG-NEXT: v_mov_b32_e32 v23, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB3_11
; SDAG-NEXT: ; %bb.8: ; %udiv-preheader
-; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v34
+; SDAG-NEXT: v_lshr_b64 v[18:19], v[4:5], v34
; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 64, v34
; SDAG-NEXT: v_subrev_i32_e32 v39, vcc, 64, v34
; SDAG-NEXT: v_lshr_b64 v[26:27], v[6:7], v34
@@ -2747,100 +2748,100 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshl_b64 v[28:29], v[6:7], v28
; SDAG-NEXT: v_lshr_b64 v[49:50], v[6:7], v39
; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v13, vcc
-; SDAG-NEXT: v_or_b32_e32 v21, v21, v29
-; SDAG-NEXT: v_or_b32_e32 v20, v20, v28
+; SDAG-NEXT: v_or_b32_e32 v19, v19, v29
+; SDAG-NEXT: v_or_b32_e32 v18, v18, v28
; SDAG-NEXT: v_addc_u32_e32 v48, vcc, -1, v14, vcc
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34
-; SDAG-NEXT: v_cndmask_b32_e64 v21, v50, v21, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v20, v49, v20, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v50, v19, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v49, v18, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v29, 0, v27, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v28, 0, v26, s[4:5]
; SDAG-NEXT: v_addc_u32_e32 v49, vcc, -1, v15, vcc
; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34
-; SDAG-NEXT: v_cndmask_b32_e32 v27, v21, v5, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v26, v20, v4, vcc
-; SDAG-NEXT: v_mov_b32_e32 v21, 0
+; SDAG-NEXT: v_cndmask_b32_e32 v27, v19, v5, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v26, v18, v4, vcc
+; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: .LBB3_9: ; %udiv-do-while
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
; SDAG-NEXT: v_lshl_b64 v[28:29], v[28:29], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v27
+; SDAG-NEXT: v_lshrrev_b32_e32 v18, 31, v27
; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v50, 31, v19
-; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v51, 31, v17
+; SDAG-NEXT: v_lshrrev_b32_e32 v50, 31, v17
; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
-; SDAG-NEXT: v_or_b32_e32 v28, v28, v20
+; SDAG-NEXT: v_lshrrev_b32_e32 v51, 31, v21
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
+; SDAG-NEXT: v_or_b32_e32 v18, v28, v18
; SDAG-NEXT: v_or_b32_e32 v26, v26, v50
-; SDAG-NEXT: v_or_b32_e32 v18, v18, v51
-; SDAG-NEXT: v_or_b32_e32 v19, v23, v19
-; SDAG-NEXT: v_or_b32_e32 v17, v25, v17
-; SDAG-NEXT: v_or_b32_e32 v18, v22, v18
-; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v38, v26
-; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v39, v27, vcc
-; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v48, v28, vcc
-; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v49, v29, vcc
-; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v20
-; SDAG-NEXT: v_and_b32_e32 v20, 1, v25
-; SDAG-NEXT: v_and_b32_e32 v50, v25, v15
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v51
+; SDAG-NEXT: v_or_b32_e32 v17, v23, v17
+; SDAG-NEXT: v_or_b32_e32 v21, v25, v21
+; SDAG-NEXT: v_sub_i32_e32 v25, vcc, v38, v26
+; SDAG-NEXT: v_or_b32_e32 v16, v22, v16
+; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v39, v27, vcc
+; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v48, v18, vcc
+; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v49, v29, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v25
+; SDAG-NEXT: v_and_b32_e32 v28, v25, v12
+; SDAG-NEXT: v_and_b32_e32 v50, v25, v13
; SDAG-NEXT: v_and_b32_e32 v51, v25, v14
-; SDAG-NEXT: v_and_b32_e32 v52, v25, v13
-; SDAG-NEXT: v_and_b32_e32 v25, v25, v12
-; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v26, v25
-; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v52, vcc
-; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v28, v51, vcc
-; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v50, vcc
+; SDAG-NEXT: v_and_b32_e32 v52, v25, v15
+; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v26, v28
+; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v50, vcc
+; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v18, v51, vcc
+; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v52, vcc
; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v34
; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc
; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v36, vcc
; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v37, vcc
-; SDAG-NEXT: v_or_b32_e32 v51, v35, v37
; SDAG-NEXT: v_or_b32_e32 v50, v34, v36
+; SDAG-NEXT: v_or_b32_e32 v51, v35, v37
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[50:51]
+; SDAG-NEXT: v_and_b32_e32 v18, 1, v25
; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; SDAG-NEXT: v_or_b32_e32 v16, v24, v16
-; SDAG-NEXT: v_mov_b32_e32 v25, v21
-; SDAG-NEXT: v_mov_b32_e32 v24, v20
+; SDAG-NEXT: v_or_b32_e32 v20, v24, v20
+; SDAG-NEXT: v_mov_b32_e32 v25, v19
+; SDAG-NEXT: v_mov_b32_e32 v24, v18
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
; SDAG-NEXT: s_cbranch_execnz .LBB3_9
; SDAG-NEXT: ; %bb.10: ; %Flow
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
; SDAG-NEXT: .LBB3_11: ; %Flow11
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v17
; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
-; SDAG-NEXT: v_or_b32_e32 v18, v18, v24
-; SDAG-NEXT: v_or_b32_e32 v23, v23, v19
-; SDAG-NEXT: v_or_b32_e32 v21, v21, v17
-; SDAG-NEXT: v_or_b32_e32 v22, v22, v18
-; SDAG-NEXT: v_or_b32_e32 v20, v20, v16
+; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v21
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v24
+; SDAG-NEXT: v_or_b32_e32 v23, v23, v17
+; SDAG-NEXT: v_or_b32_e32 v19, v19, v21
+; SDAG-NEXT: v_or_b32_e32 v22, v22, v16
+; SDAG-NEXT: v_or_b32_e32 v18, v18, v20
; SDAG-NEXT: .LBB3_12: ; %Flow12
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
-; SDAG-NEXT: v_mul_lo_u32 v18, v32, v11
+; SDAG-NEXT: v_mul_lo_u32 v20, v32, v11
; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v32, v10, 0
; SDAG-NEXT: v_mul_lo_u32 v28, v30, v10
; SDAG-NEXT: v_mul_lo_u32 v29, v33, v8
; SDAG-NEXT: v_mul_lo_u32 v33, v31, v9
; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v32, 0
-; SDAG-NEXT: v_mov_b32_e32 v19, 0
-; SDAG-NEXT: v_mul_lo_u32 v34, v20, v15
-; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v20, v14, 0
-; SDAG-NEXT: v_mul_lo_u32 v35, v21, v14
+; SDAG-NEXT: v_mov_b32_e32 v21, 0
+; SDAG-NEXT: v_mul_lo_u32 v34, v18, v15
+; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v18, v14, 0
+; SDAG-NEXT: v_mul_lo_u32 v35, v19, v14
; SDAG-NEXT: v_mul_lo_u32 v23, v23, v12
; SDAG-NEXT: v_mul_lo_u32 v36, v22, v13
-; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v20, 0
-; SDAG-NEXT: v_add_i32_e32 v17, vcc, v17, v18
-; SDAG-NEXT: v_mov_b32_e32 v18, v11
-; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v9, v32, v[18:19]
+; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v18, 0
+; SDAG-NEXT: v_add_i32_e32 v17, vcc, v17, v20
+; SDAG-NEXT: v_mov_b32_e32 v20, v11
+; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v9, v32, v[20:21]
; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
-; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], v25, v34
+; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], v25, v34
; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v17, v28
; SDAG-NEXT: v_mov_b32_e32 v28, v27
-; SDAG-NEXT: v_mov_b32_e32 v27, v19
+; SDAG-NEXT: v_mov_b32_e32 v27, v21
; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v30, v[26:27]
-; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v18, v35
-; SDAG-NEXT: v_mov_b32_e32 v18, v15
-; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v13, v20, v[18:19]
+; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v20, v35
+; SDAG-NEXT: v_mov_b32_e32 v20, v15
+; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v13, v18, v[20:21]
; SDAG-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v31, v8, v[16:17]
; SDAG-NEXT: v_mov_b32_e32 v8, v11
; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v28, v8
@@ -2849,24 +2850,24 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[24:25]
; SDAG-NEXT: v_mov_b32_e32 v22, v27
-; SDAG-NEXT: v_mov_b32_e32 v27, v19
-; SDAG-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v12, v21, v[26:27]
+; SDAG-NEXT: v_mov_b32_e32 v27, v21
+; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v19, v[26:27]
; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v29, v16
; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[17:18]
; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v23, v11
-; SDAG-NEXT: v_mov_b32_e32 v11, v20
+; SDAG-NEXT: v_mov_b32_e32 v11, v21
; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v22, v11
; SDAG-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, s[4:5]
; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v33, v16
; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v36, v17
-; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v13, v21, v[11:12]
+; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v13, v19, v[11:12]
; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v15
; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v16, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v8, vcc
; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc
; SDAG-NEXT: v_add_i32_e32 v8, vcc, v11, v10
; SDAG-NEXT: v_addc_u32_e32 v9, vcc, v12, v17, vcc
-; SDAG-NEXT: v_mov_b32_e32 v10, v19
+; SDAG-NEXT: v_mov_b32_e32 v10, v20
; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v14
; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v10, vcc
; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v8, vcc
@@ -2876,7 +2877,6 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-LABEL: v_urem_v2i128_vv:
; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: v_or_b32_e32 v16, v8, v10
; GISEL-NEXT: v_or_b32_e32 v17, v9, v11
; GISEL-NEXT: v_or_b32_e32 v18, v0, v2
@@ -2887,20 +2887,21 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_ffbh_u32_e32 v25, v10
; GISEL-NEXT: v_ffbh_u32_e32 v26, v1
; GISEL-NEXT: v_ffbh_u32_e32 v27, v0
-; GISEL-NEXT: v_ffbh_u32_e32 v28, v3
-; GISEL-NEXT: v_ffbh_u32_e32 v29, v2
+; GISEL-NEXT: v_ffbh_u32_e32 v28, v2
+; GISEL-NEXT: v_ffbh_u32_e32 v29, v3
; GISEL-NEXT: v_mov_b32_e32 v20, 0x7f
; GISEL-NEXT: v_mov_b32_e32 v21, 0
+; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], 32, v23
; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], 32, v25
; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], 32, v27
-; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v29
+; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v28
; GISEL-NEXT: v_min_u32_e32 v16, v22, v16
; GISEL-NEXT: v_min_u32_e32 v17, v24, v17
; GISEL-NEXT: v_min_u32_e32 v18, v26, v18
-; GISEL-NEXT: v_min_u32_e32 v19, v28, v19
+; GISEL-NEXT: v_min_u32_e32 v19, v29, v19
; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e32 v16, vcc, 64, v16
@@ -2913,28 +2914,28 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5]
; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_xor_b32_e32 v23, 0x7f, v18
; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[18:19], v[20:21]
-; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
-; GISEL-NEXT: v_xor_b32_e32 v20, 0x7f, v18
-; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[16:17]
; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
-; GISEL-NEXT: v_or_b32_e32 v20, v20, v16
+; GISEL-NEXT: v_or_b32_e32 v20, v23, v16
; GISEL-NEXT: v_or_b32_e32 v21, v19, v17
+; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
-; GISEL-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21]
; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GISEL-NEXT: v_or_b32_e32 v21, v22, v23
-; GISEL-NEXT: v_and_b32_e32 v22, 1, v21
; GISEL-NEXT: v_or_b32_e32 v20, v21, v20
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22
+; GISEL-NEXT: v_and_b32_e32 v21, 1, v21
+; GISEL-NEXT: v_and_b32_e32 v20, 1, v20
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21
; GISEL-NEXT: v_cndmask_b32_e64 v32, v0, 0, vcc
-; GISEL-NEXT: v_and_b32_e32 v22, 1, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v33, v1, 0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v20
; GISEL-NEXT: v_cndmask_b32_e64 v20, v2, 0, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v21, v3, 0, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22
-; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
+; GISEL-NEXT: v_cndmask_b32_e64 v33, v1, 0, vcc
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
; GISEL-NEXT: s_cbranch_execz .LBB3_6
; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
@@ -2987,11 +2988,11 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30
; GISEL-NEXT: v_cndmask_b32_e32 v18, v26, v18, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v28, 0, v16, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v29, 0, v17, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v26, 0, v16, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v27, 0, v17, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30
-; GISEL-NEXT: v_cndmask_b32_e32 v26, v18, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v27, v19, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v28, v18, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v29, v19, v1, vcc
; GISEL-NEXT: v_mov_b32_e32 v25, 0
; GISEL-NEXT: v_mov_b32_e32 v19, s7
; GISEL-NEXT: v_mov_b32_e32 v18, s6
@@ -2999,40 +3000,40 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v16, s4
; GISEL-NEXT: .LBB3_3: ; %udiv-do-while3
; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT: v_lshrrev_b32_e32 v38, 31, v23
; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1
-; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v23
-; GISEL-NEXT: v_lshl_b64 v[38:39], v[26:27], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v29
; GISEL-NEXT: v_lshl_b64 v[28:29], v[28:29], 1
-; GISEL-NEXT: v_lshrrev_b32_e32 v26, 31, v27
-; GISEL-NEXT: v_lshrrev_b32_e32 v27, 31, v21
-; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
+; GISEL-NEXT: v_lshl_b64 v[26:27], v[26:27], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v39, 31, v21
; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v30
; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc
+; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
; GISEL-NEXT: v_or_b32_e32 v22, v16, v18
; GISEL-NEXT: v_or_b32_e32 v23, v17, v19
-; GISEL-NEXT: v_or_b32_e32 v18, v28, v26
-; GISEL-NEXT: v_or_b32_e32 v19, v38, v27
-; GISEL-NEXT: v_or_b32_e32 v20, v20, v24
+; GISEL-NEXT: v_or_b32_e32 v18, v26, v24
+; GISEL-NEXT: v_or_b32_e32 v19, v28, v39
; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc
; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc
; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v34, v19
-; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v35, v39, vcc
+; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v35, v29, vcc
; GISEL-NEXT: v_or_b32_e32 v16, v30, v32
; GISEL-NEXT: v_or_b32_e32 v17, v31, v33
; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v36, v18, vcc
-; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v37, v29, vcc
+; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v37, v27, vcc
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
; GISEL-NEXT: v_ashrrev_i32_e32 v16, 31, v24
; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GISEL-NEXT: v_and_b32_e32 v24, 1, v16
; GISEL-NEXT: v_and_b32_e32 v17, v16, v8
-; GISEL-NEXT: v_and_b32_e32 v27, v16, v9
-; GISEL-NEXT: v_and_b32_e32 v28, v16, v10
-; GISEL-NEXT: v_and_b32_e32 v16, v16, v11
-; GISEL-NEXT: v_sub_i32_e32 v26, vcc, v19, v17
-; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v39, v27, vcc
-; GISEL-NEXT: v_subb_u32_e32 v28, vcc, v18, v28, vcc
-; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v29, v16, vcc
+; GISEL-NEXT: v_and_b32_e32 v26, v16, v9
+; GISEL-NEXT: v_and_b32_e32 v39, v16, v10
+; GISEL-NEXT: v_and_b32_e32 v48, v16, v11
+; GISEL-NEXT: v_and_b32_e32 v24, 1, v16
+; GISEL-NEXT: v_sub_i32_e32 v28, vcc, v19, v17
+; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v29, v26, vcc
+; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v18, v39, vcc
+; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v48, vcc
+; GISEL-NEXT: v_or_b32_e32 v20, v20, v38
; GISEL-NEXT: v_mov_b32_e32 v16, v24
; GISEL-NEXT: v_mov_b32_e32 v17, v25
; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
index c3c1540383ec63..a4425666765618 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -694,18 +694,14 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1)
; SI-NEXT: s_load_dword s8, s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
-; SI-NEXT: v_lshlrev_b32_e32 v5, 5, v0
-; SI-NEXT: v_mov_b32_e32 v6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; SI-NEXT: v_mov_b32_e32 v9, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT: buffer_load_dwordx4 v[1:4], v[5:6], s[4:7], 0 addr64
-; SI-NEXT: v_lshlrev_b32_e32 v9, 1, v0
-; SI-NEXT: v_mov_b32_e32 v10, v6
-; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT: buffer_load_dwordx4 v[5:8], v[5:6], s[4:7], 0 addr64 offset:16
-; SI-NEXT: s_cmp_eq_u32 s8, 1
+; SI-NEXT: buffer_load_dwordx4 v[1:4], v[8:9], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dwordx4 v[5:8], v[8:9], s[4:7], 0 addr64 offset:16
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v11, v2
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
@@ -721,61 +717,64 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1)
; SI-NEXT: v_cvt_f32_f16_e32 v16, v7
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; SI-NEXT: v_cvt_f32_f16_e32 v17, v8
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8
+; SI-NEXT: v_lshlrev_b32_e32 v8, 1, v0
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: s_cmp_eq_u32 s8, 1
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_eq_u32 s8, 2
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v18
+; SI-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_eq_u32 s8, 3
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_eq_u32 s8, 4
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_eq_u32 s8, 5
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_eq_u32 s8, 6
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_eq_u32 s8, 7
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_eq_u32 s8, 8
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_eq_u32 s8, 9
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_eq_u32 s8, 10
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_eq_u32 s8, 11
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_eq_u32 s8, 12
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_eq_u32 s8, 13
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_eq_u32 s8, 14
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_eq_u32 s8, 15
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: buffer_store_short v0, v[9:10], s[0:3], 0 addr64
+; SI-NEXT: buffer_store_short v0, v[8:9], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_extractelement_v16f16_dynamic_sgpr:
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 3199b76d279fab..3c70883f09d2c1 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -3030,50 +3030,50 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 {
; VI-LABEL: v_test_canonicalize_var_v32f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_f16_sdwa v20, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_max_f16_sdwa v19, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e32 v0, v0, v0
-; VI-NEXT: v_or_b32_e32 v0, v0, v20
-; VI-NEXT: v_max_f16_sdwa v20, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v0, v0, v19
+; VI-NEXT: v_max_f16_sdwa v19, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e32 v1, v1, v1
-; VI-NEXT: v_or_b32_e32 v1, v1, v20
-; VI-NEXT: v_max_f16_sdwa v20, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v1, v1, v19
+; VI-NEXT: v_max_f16_sdwa v19, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e32 v2, v2, v2
-; VI-NEXT: v_or_b32_e32 v2, v2, v20
-; VI-NEXT: v_max_f16_sdwa v20, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v2, v2, v19
+; VI-NEXT: v_max_f16_sdwa v19, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e32 v3, v3, v3
-; VI-NEXT: v_or_b32_e32 v3, v3, v20
-; VI-NEXT: v_max_f16_sdwa v20, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v3, v3, v19
+; VI-NEXT: v_max_f16_sdwa v19, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e32 v4, v4, v4
-; VI-NEXT: v_or_b32_e32 v4, v4, v20
-; VI-NEXT: v_max_f16_sdwa v20, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v4, v4, v19
+; VI-NEXT: v_max_f16_sdwa v19, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e32 v5, v5, v5
-; VI-NEXT: v_or_b32_e32 v5, v5, v20
-; VI-NEXT: v_max_f16_sdwa v20, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v5, v5, v19
+; VI-NEXT: v_max_f16_sdwa v19, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e32 v6, v6, v6
-; VI-NEXT: v_or_b32_e32 v6, v6, v20
-; VI-NEXT: v_max_f16_sdwa v20, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v6, v6, v19
+; VI-NEXT: v_max_f16_sdwa v19, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e32 v7, v7, v7
-; VI-NEXT: v_or_b32_e32 v7, v7, v20
-; VI-NEXT: v_max_f16_sdwa v20, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v7, v7, v19
+; VI-NEXT: v_max_f16_sdwa v19, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e32 v8, v8, v8
-; VI-NEXT: v_or_b32_e32 v8, v8, v20
-; VI-NEXT: v_max_f16_sdwa v20, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v8, v8, v19
+; VI-NEXT: v_max_f16_sdwa v19, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e32 v9, v9, v9
-; VI-NEXT: v_or_b32_e32 v9, v9, v20
-; VI-NEXT: v_max_f16_sdwa v20, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v9, v9, v19
+; VI-NEXT: v_max_f16_sdwa v19, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e32 v10, v10, v10
+; VI-NEXT: v_or_b32_e32 v10, v10, v19
+; VI-NEXT: v_max_f16_sdwa v19, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_max_f16_e32 v11, v11, v11
; VI-NEXT: v_max_f16_sdwa v16, v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_sdwa v17, v14, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_sdwa v18, v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v11, v11, v19
; VI-NEXT: v_max_f16_sdwa v19, v12, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_or_b32_e32 v10, v10, v20
-; VI-NEXT: v_max_f16_sdwa v20, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e32 v15, v15, v15
; VI-NEXT: v_max_f16_e32 v14, v14, v14
; VI-NEXT: v_max_f16_e32 v13, v13, v13
; VI-NEXT: v_max_f16_e32 v12, v12, v12
-; VI-NEXT: v_max_f16_e32 v11, v11, v11
-; VI-NEXT: v_or_b32_e32 v11, v11, v20
; VI-NEXT: v_or_b32_e32 v12, v12, v19
; VI-NEXT: v_or_b32_e32 v13, v13, v18
; VI-NEXT: v_or_b32_e32 v14, v14, v17
@@ -3342,11 +3342,11 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
; CI-LABEL: v_test_canonicalize_var_v64f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
-; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
@@ -3358,7 +3358,7 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
; CI-NEXT: v_cvt_f16_f32_e32 v2, v4
; CI-NEXT: v_cvt_f16_f32_e32 v4, v5
; CI-NEXT: v_cvt_f16_f32_e32 v5, v7
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v11
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
@@ -3370,341 +3370,344 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_or_b32_e32 v2, v3, v2
; CI-NEXT: v_cvt_f16_f32_e32 v3, v6
-; CI-NEXT: v_cvt_f16_f32_e32 v6, v10
-; CI-NEXT: v_cvt_f16_f32_e32 v9, v13
-; CI-NEXT: v_cvt_f16_f32_e32 v10, v16
+; CI-NEXT: v_cvt_f16_f32_e32 v6, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v9, v16
+; CI-NEXT: v_cvt_f16_f32_e32 v16, v21
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; CI-NEXT: v_cvt_f16_f32_e32 v13, v17
+; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v3, v4, v3
; CI-NEXT: v_cvt_f16_f32_e32 v4, v8
-; CI-NEXT: v_cvt_f16_f32_e32 v8, v14
-; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32
-; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
-; CI-NEXT: v_cvt_f16_f32_e32 v17, v23
+; CI-NEXT: v_cvt_f16_f32_e32 v8, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v20
+; CI-NEXT: v_cvt_f16_f32_e32 v20, v25
; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
+; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v28
; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT: v_cvt_f32_f16_e32 v20, v20
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT: v_or_b32_e32 v4, v5, v4
-; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v10
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v15
+; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; CI-NEXT: v_cvt_f16_f32_e32 v20, v20
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; CI-NEXT: v_or_b32_e32 v5, v6, v5
; CI-NEXT: v_cvt_f16_f32_e32 v6, v12
-; CI-NEXT: v_or_b32_e32 v5, v7, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v11
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v15
+; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:8
; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v21
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: s_waitcnt vmcnt(3)
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: s_waitcnt vmcnt(2)
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; CI-NEXT: v_cvt_f32_f16_e32 v32, v32
; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; CI-NEXT: v_or_b32_e32 v6, v7, v6
-; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v8
-; CI-NEXT: v_cvt_f16_f32_e32 v8, v19
-; CI-NEXT: v_or_b32_e32 v7, v9, v7
-; CI-NEXT: v_cvt_f16_f32_e32 v9, v20
-; CI-NEXT: v_cvt_f32_f16_e32 v12, v8
-; CI-NEXT: v_cvt_f32_f16_e32 v8, v10
-; CI-NEXT: v_cvt_f32_f16_e32 v10, v11
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v18
-; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124
-; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112
-; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116
-; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
-; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
-; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v14
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v19
+; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; CI-NEXT: v_or_b32_e32 v31, v32, v31
+; CI-NEXT: v_add_i32_e32 v32, vcc, 0x7c, v0
+; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
+; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
+; CI-NEXT: v_cvt_f16_f32_e32 v19, v26
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: v_cvt_f32_f16_e32 v19, v19
+; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; CI-NEXT: v_or_b32_e32 v7, v8, v7
+; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v9, v18
; CI-NEXT: v_or_b32_e32 v8, v10, v8
-; CI-NEXT: v_cvt_f16_f32_e32 v10, v11
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v13
-; CI-NEXT: v_cvt_f16_f32_e32 v13, v9
-; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v10
-; CI-NEXT: v_or_b32_e32 v9, v11, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v24
+; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v18, v23
+; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT: v_cvt_f32_f16_e32 v18, v18
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
+; CI-NEXT: v_or_b32_e32 v9, v10, v9
; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v13
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v25
; CI-NEXT: v_cvt_f16_f32_e32 v13, v22
-; CI-NEXT: v_or_b32_e32 v10, v12, v10
-; CI-NEXT: v_cvt_f16_f32_e32 v12, v26
-; CI-NEXT: v_cvt_f32_f16_e32 v16, v11
-; CI-NEXT: v_cvt_f32_f16_e32 v11, v13
-; CI-NEXT: v_cvt_f32_f16_e32 v13, v15
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v24
-; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT: v_or_b32_e32 v10, v14, v10
+; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; CI-NEXT: v_or_b32_e32 v17, v18, v17
+; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
+; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v27
+; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
-; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
-; CI-NEXT: v_cvt_f16_f32_e32 v22, v30
-; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; CI-NEXT: v_or_b32_e32 v11, v13, v11
-; CI-NEXT: v_cvt_f16_f32_e32 v13, v15
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v17
-; CI-NEXT: v_cvt_f16_f32_e32 v17, v12
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v29
-; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13
-; CI-NEXT: v_or_b32_e32 v12, v15, v12
-; CI-NEXT: s_waitcnt vmcnt(6)
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v31
-; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v17
-; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132
-; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120
; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
-; CI-NEXT: v_cvt_f32_f16_e32 v23, v15
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v27
-; CI-NEXT: v_cvt_f32_f16_e32 v25, v25
-; CI-NEXT: s_waitcnt vmcnt(7)
-; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
-; CI-NEXT: s_waitcnt vmcnt(6)
-; CI-NEXT: v_cvt_f16_f32_e32 v21, v33
-; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
-; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
-; CI-NEXT: v_cvt_f32_f16_e32 v24, v14
-; CI-NEXT: v_cvt_f16_f32_e32 v14, v28
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
-; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
-; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
-; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
-; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
-; CI-NEXT: v_cvt_f16_f32_e32 v28, v23
-; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; CI-NEXT: v_or_b32_e32 v13, v16, v13
-; CI-NEXT: v_cvt_f16_f32_e32 v16, v32
-; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:12
-; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; CI-NEXT: v_or_b32_e32 v14, v15, v14
-; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v22
-; CI-NEXT: v_or_b32_e32 v15, v25, v15
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v21
-; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96
-; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100
-; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64
-; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v16
-; CI-NEXT: v_or_b32_e32 v16, v24, v25
-; CI-NEXT: v_lshlrev_b32_e32 v24, 16, v27
-; CI-NEXT: v_or_b32_e32 v25, v28, v24
-; CI-NEXT: s_waitcnt vmcnt(9)
-; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
-; CI-NEXT: s_waitcnt vmcnt(8)
-; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
-; CI-NEXT: s_waitcnt vmcnt(7)
-; CI-NEXT: v_cvt_f16_f32_e32 v20, v20
-; CI-NEXT: v_cvt_f32_f16_e32 v18, v18
-; CI-NEXT: v_cvt_f32_f16_e32 v19, v19
-; CI-NEXT: v_cvt_f32_f16_e32 v20, v20
-; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
-; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
-; CI-NEXT: v_cvt_f16_f32_e32 v20, v20
-; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; CI-NEXT: v_or_b32_e32 v20, v19, v20
-; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20
-; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:8
-; CI-NEXT: s_waitcnt vmcnt(8)
-; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
-; CI-NEXT: s_waitcnt vmcnt(7)
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
+; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT: v_or_b32_e32 v19, v20, v19
+; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v21
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v30
+; CI-NEXT: v_or_b32_e32 v20, v22, v20
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v29
; CI-NEXT: s_waitcnt vmcnt(6)
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v34
-; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
-; CI-NEXT: v_cvt_f32_f16_e32 v26, v26
-; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
-; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; CI-NEXT: v_or_b32_e32 v17, v17, v26
-; CI-NEXT: v_add_i32_e32 v26, vcc, 0x7c, v0
-; CI-NEXT: v_or_b32_e32 v18, v27, v18
-; CI-NEXT: buffer_store_dword v17, v26, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v17, vcc, 0x78, v0
-; CI-NEXT: buffer_store_dword v18, v17, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v17, vcc, 0x74, v0
-; CI-NEXT: buffer_store_dword v20, v17, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v17, vcc, 0x70, v0
-; CI-NEXT: buffer_store_dword v25, v17, s[0:3], 0 offen
-; CI-NEXT: s_waitcnt vmcnt(8)
-; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
-; CI-NEXT: s_waitcnt vmcnt(7)
-; CI-NEXT: v_cvt_f16_f32_e32 v20, v22
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88
-; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92
-; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80
-; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84
-; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76
-; CI-NEXT: v_cvt_f16_f32_e32 v22, v23
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
-; CI-NEXT: v_cvt_f32_f16_e32 v20, v20
-; CI-NEXT: s_waitcnt vmcnt(12)
-; CI-NEXT: v_cvt_f16_f32_e32 v29, v29
; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
-; CI-NEXT: v_cvt_f16_f32_e32 v20, v20
-; CI-NEXT: v_cvt_f32_f16_e32 v29, v29
-; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
-; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; CI-NEXT: v_or_b32_e32 v20, v21, v20
-; CI-NEXT: v_add_i32_e32 v21, vcc, 0x6c, v0
-; CI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen
-; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v22
-; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24
-; CI-NEXT: v_cvt_f16_f32_e32 v29, v29
-; CI-NEXT: s_waitcnt vmcnt(13)
-; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
-; CI-NEXT: s_waitcnt vmcnt(12)
-; CI-NEXT: v_cvt_f16_f32_e32 v23, v24
-; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:28
-; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:16
-; CI-NEXT: v_cvt_f32_f16_e32 v19, v19
-; CI-NEXT: v_cvt_f32_f16_e32 v23, v23
-; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
-; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; CI-NEXT: v_or_b32_e32 v20, v23, v20
-; CI-NEXT: s_waitcnt vmcnt(9)
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT: s_waitcnt vmcnt(8)
-; CI-NEXT: v_cvt_f16_f32_e32 v23, v28
-; CI-NEXT: s_waitcnt vmcnt(7)
-; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
-; CI-NEXT: s_waitcnt vmcnt(6)
-; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
-; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
-; CI-NEXT: v_cvt_f32_f16_e32 v23, v23
-; CI-NEXT: s_waitcnt vmcnt(4)
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; CI-NEXT: v_cvt_f32_f16_e32 v18, v18
-; CI-NEXT: v_cvt_f32_f16_e32 v26, v26
-; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
-; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; CI-NEXT: v_or_b32_e32 v23, v27, v23
-; CI-NEXT: v_add_i32_e32 v27, vcc, 0x68, v0
-; CI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen
-; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36
-; CI-NEXT: v_cvt_f32_f16_e32 v25, v25
-; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
-; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; CI-NEXT: v_or_b32_e32 v17, v17, v18
-; CI-NEXT: v_add_i32_e32 v18, vcc, 0x64, v0
-; CI-NEXT: v_or_b32_e32 v25, v25, v26
-; CI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v17, vcc, 0x60, v0
-; CI-NEXT: buffer_store_dword v25, v17, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v17, vcc, 0x5c, v0
; CI-NEXT: s_waitcnt vmcnt(5)
-; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
-; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
-; CI-NEXT: v_cvt_f32_f16_e32 v24, v24
-; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
-; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
-; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
-; CI-NEXT: v_or_b32_e32 v19, v24, v19
-; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44
+; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
-; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
; CI-NEXT: v_or_b32_e32 v21, v22, v21
-; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40
-; CI-NEXT: s_waitcnt vmcnt(5)
-; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; CI-NEXT: s_waitcnt vmcnt(4)
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT: v_cvt_f32_f16_e32 v23, v23
-; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
-; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
+; CI-NEXT: s_waitcnt vmcnt(3)
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: s_waitcnt vmcnt(2)
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; CI-NEXT: v_cvt_f32_f16_e32 v32, v32
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; CI-NEXT: v_or_b32_e32 v31, v32, v31
+; CI-NEXT: v_add_i32_e32 v32, vcc, 0x78, v0
+; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
+; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
-; CI-NEXT: v_cvt_f32_f16_e32 v24, v24
-; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
-; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
-; CI-NEXT: v_cvt_f16_f32_e32 v28, v22
-; CI-NEXT: v_or_b32_e32 v22, v23, v27
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:52
-; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v24
-; CI-NEXT: v_or_b32_e32 v23, v28, v23
-; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56
-; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48
-; CI-NEXT: s_waitcnt vmcnt(2)
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; CI-NEXT: v_cvt_f32_f16_e32 v32, v32
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; CI-NEXT: v_or_b32_e32 v31, v32, v31
+; CI-NEXT: v_add_i32_e32 v32, vcc, 0x74, v0
+; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
+; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
-; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
-; CI-NEXT: v_cvt_f32_f16_e32 v28, v28
-; CI-NEXT: v_cvt_f32_f16_e32 v24, v24
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
-; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
-; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; CI-NEXT: v_or_b32_e32 v24, v24, v27
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; CI-NEXT: v_cvt_f32_f16_e32 v32, v32
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; CI-NEXT: v_or_b32_e32 v31, v32, v31
+; CI-NEXT: v_add_i32_e32 v32, vcc, 0x70, v0
+; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
+; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
+; CI-NEXT: s_waitcnt vmcnt(1)
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; CI-NEXT: v_or_b32_e32 v27, v28, v27
-; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; CI-NEXT: v_cvt_f32_f16_e32 v32, v32
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; CI-NEXT: v_or_b32_e32 v31, v32, v31
+; CI-NEXT: v_add_i32_e32 v32, vcc, 0x6c, v0
+; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
+; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
+; CI-NEXT: s_waitcnt vmcnt(1)
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
-; CI-NEXT: v_cvt_f32_f16_e32 v28, v28
-; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
-; CI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
-; CI-NEXT: v_or_b32_e32 v28, v29, v28
-; CI-NEXT: buffer_store_dword v28, v17, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v17, vcc, 0x58, v0
-; CI-NEXT: buffer_store_dword v27, v17, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v17, vcc, 0x54, v0
-; CI-NEXT: buffer_store_dword v24, v17, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0
-; CI-NEXT: buffer_store_dword v23, v17, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v17, vcc, 0x4c, v0
-; CI-NEXT: buffer_store_dword v22, v17, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v17, vcc, 0x48, v0
-; CI-NEXT: buffer_store_dword v21, v17, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v17, vcc, 0x44, v0
-; CI-NEXT: buffer_store_dword v19, v17, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v17, vcc, 64, v0
-; CI-NEXT: buffer_store_dword v20, v17, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v17, vcc, 60, v0
-; CI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v16, vcc, 56, v0
-; CI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v15, vcc, 52, v0
-; CI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v14, vcc, 48, v0
-; CI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v13, vcc, 44, v0
-; CI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v12, vcc, 40, v0
-; CI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; CI-NEXT: v_cvt_f32_f16_e32 v32, v32
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; CI-NEXT: v_or_b32_e32 v31, v32, v31
+; CI-NEXT: v_add_i32_e32 v32, vcc, 0x68, v0
+; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
+; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
+; CI-NEXT: s_waitcnt vmcnt(1)
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; CI-NEXT: v_cvt_f32_f16_e32 v32, v32
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; CI-NEXT: v_or_b32_e32 v31, v32, v31
+; CI-NEXT: v_add_i32_e32 v32, vcc, 0x64, v0
+; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
+; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
+; CI-NEXT: s_waitcnt vmcnt(1)
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; CI-NEXT: v_cvt_f32_f16_e32 v32, v32
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; CI-NEXT: v_or_b32_e32 v31, v32, v31
+; CI-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0
+; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
+; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; CI-NEXT: s_waitcnt vmcnt(1)
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; CI-NEXT: v_cvt_f32_f16_e32 v32, v32
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; CI-NEXT: v_or_b32_e32 v31, v32, v31
+; CI-NEXT: v_add_i32_e32 v32, vcc, 0x5c, v0
+; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
+; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
+; CI-NEXT: s_waitcnt vmcnt(1)
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; CI-NEXT: v_cvt_f32_f16_e32 v32, v32
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; CI-NEXT: v_or_b32_e32 v31, v32, v31
+; CI-NEXT: v_add_i32_e32 v32, vcc, 0x58, v0
+; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
+; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
+; CI-NEXT: s_waitcnt vmcnt(1)
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; CI-NEXT: v_cvt_f32_f16_e32 v32, v32
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; CI-NEXT: v_or_b32_e32 v31, v32, v31
+; CI-NEXT: v_add_i32_e32 v32, vcc, 0x54, v0
+; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
+; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
+; CI-NEXT: s_waitcnt vmcnt(1)
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; CI-NEXT: v_cvt_f32_f16_e32 v32, v32
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; CI-NEXT: v_or_b32_e32 v31, v32, v31
+; CI-NEXT: v_add_i32_e32 v32, vcc, 0x50, v0
+; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
+; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
+; CI-NEXT: s_waitcnt vmcnt(1)
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; CI-NEXT: v_cvt_f32_f16_e32 v32, v32
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; CI-NEXT: v_or_b32_e32 v31, v32, v31
+; CI-NEXT: v_add_i32_e32 v32, vcc, 0x4c, v0
+; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
+; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
+; CI-NEXT: s_waitcnt vmcnt(1)
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; CI-NEXT: v_cvt_f32_f16_e32 v32, v32
+; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; CI-NEXT: v_or_b32_e32 v31, v32, v31
+; CI-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0
+; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
+; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4
+; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32
+; CI-NEXT: s_waitcnt vmcnt(1)
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
+; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; CI-NEXT: v_or_b32_e32 v14, v15, v14
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v16
+; CI-NEXT: v_cvt_f16_f32_e32 v16, v18
+; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; CI-NEXT: v_or_b32_e32 v12, v12, v15
+; CI-NEXT: v_or_b32_e32 v11, v16, v11
+; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0
+; CI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v11, vcc, 64, v0
+; CI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v11, vcc, 60, v0
+; CI-NEXT: buffer_store_dword v14, v11, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v11, vcc, 56, v0
+; CI-NEXT: buffer_store_dword v21, v11, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v11, vcc, 52, v0
+; CI-NEXT: buffer_store_dword v20, v11, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v11, vcc, 48, v0
+; CI-NEXT: buffer_store_dword v19, v11, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v11, vcc, 44, v0
+; CI-NEXT: buffer_store_dword v17, v11, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v11, vcc, 40, v0
+; CI-NEXT: buffer_store_dword v13, v11, s[0:3], 0 offen
; CI-NEXT: v_add_i32_e32 v11, vcc, 36, v0
; CI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen
; CI-NEXT: v_add_i32_e32 v10, vcc, 32, v0
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index a0fe9d88e31cf9..3a7f3e41002d28 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -172,52 +172,52 @@ define i128 @fptosi_f64_to_i128(double %x) {
; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7]
; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7]
-; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v8, 3, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v9, 4, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v10, 5, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v11, 6, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v12, 7, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v13, 8, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v14, 9, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v15, 10, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v16, 11, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v17, 12, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v18, 13, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v19, 14, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v20, 15, v0
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0
; GISEL-NEXT: v_or_b32_e32 v1, v1, v2
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v3
+; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v0, v2
; GISEL-NEXT: v_or_b32_e32 v1, v1, v3
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v8
+; GISEL-NEXT: v_lshlrev_b16_e32 v8, 3, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v3
; GISEL-NEXT: v_or_b32_e32 v1, v1, v8
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v9
+; GISEL-NEXT: v_lshlrev_b16_e32 v9, 4, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v8
; GISEL-NEXT: v_or_b32_e32 v1, v1, v9
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v10
+; GISEL-NEXT: v_lshlrev_b16_e32 v10, 5, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v9
; GISEL-NEXT: v_or_b32_e32 v1, v1, v10
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v11
+; GISEL-NEXT: v_lshlrev_b16_e32 v11, 6, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v10
; GISEL-NEXT: v_or_b32_e32 v1, v1, v11
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v12
+; GISEL-NEXT: v_lshlrev_b16_e32 v12, 7, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v11
; GISEL-NEXT: v_or_b32_e32 v1, v1, v12
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v13
+; GISEL-NEXT: v_lshlrev_b16_e32 v13, 8, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v12
; GISEL-NEXT: v_or_b32_e32 v1, v1, v13
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v14
+; GISEL-NEXT: v_lshlrev_b16_e32 v14, 9, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v13
; GISEL-NEXT: v_or_b32_e32 v1, v1, v14
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v15
+; GISEL-NEXT: v_lshlrev_b16_e32 v15, 10, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v14
; GISEL-NEXT: v_or_b32_e32 v1, v1, v15
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v16
+; GISEL-NEXT: v_lshlrev_b16_e32 v16, 11, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v15
; GISEL-NEXT: v_or_b32_e32 v1, v1, v16
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v17
+; GISEL-NEXT: v_lshlrev_b16_e32 v17, 12, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v16
; GISEL-NEXT: v_or_b32_e32 v1, v1, v17
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v18
+; GISEL-NEXT: v_lshlrev_b16_e32 v18, 13, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v17
; GISEL-NEXT: v_or_b32_e32 v1, v1, v18
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v19
+; GISEL-NEXT: v_lshlrev_b16_e32 v19, 14, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v18
; GISEL-NEXT: v_or_b32_e32 v1, v1, v19
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v20
-; GISEL-NEXT: v_or_b32_e32 v1, v1, v20
+; GISEL-NEXT: v_lshlrev_b16_e32 v0, 15, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v19
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v0
+; GISEL-NEXT: v_or_b32_e32 v0, v2, v0
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0
@@ -331,34 +331,34 @@ define i128 @fptosi_f64_to_i128(double %x) {
; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16
; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18
; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1
; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18
-; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20
-; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20
-; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4
-; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
-; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6
-; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6
-; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8
-; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8
-; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10
-; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10
-; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12
-; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1
+; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3
+; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5
+; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7
+; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7
+; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9
+; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9
+; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11
+; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11
+; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13
+; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1
; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12
-; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1
-; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13
+; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1
+; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1
; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1
; GISEL-NEXT: v_mov_b32_e32 v2, v1
; GISEL-NEXT: .LBB0_9: ; %Flow3
@@ -540,52 +540,52 @@ define i128 @fptoui_f64_to_i128(double %x) {
; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7]
; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7]
-; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v8, 3, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v9, 4, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v10, 5, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v11, 6, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v12, 7, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v13, 8, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v14, 9, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v15, 10, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v16, 11, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v17, 12, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v18, 13, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v19, 14, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v20, 15, v0
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0
; GISEL-NEXT: v_or_b32_e32 v1, v1, v2
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v3
+; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v0, v2
; GISEL-NEXT: v_or_b32_e32 v1, v1, v3
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v8
+; GISEL-NEXT: v_lshlrev_b16_e32 v8, 3, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v3
; GISEL-NEXT: v_or_b32_e32 v1, v1, v8
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v9
+; GISEL-NEXT: v_lshlrev_b16_e32 v9, 4, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v8
; GISEL-NEXT: v_or_b32_e32 v1, v1, v9
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v10
+; GISEL-NEXT: v_lshlrev_b16_e32 v10, 5, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v9
; GISEL-NEXT: v_or_b32_e32 v1, v1, v10
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v11
+; GISEL-NEXT: v_lshlrev_b16_e32 v11, 6, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v10
; GISEL-NEXT: v_or_b32_e32 v1, v1, v11
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v12
+; GISEL-NEXT: v_lshlrev_b16_e32 v12, 7, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v11
; GISEL-NEXT: v_or_b32_e32 v1, v1, v12
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v13
+; GISEL-NEXT: v_lshlrev_b16_e32 v13, 8, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v12
; GISEL-NEXT: v_or_b32_e32 v1, v1, v13
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v14
+; GISEL-NEXT: v_lshlrev_b16_e32 v14, 9, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v13
; GISEL-NEXT: v_or_b32_e32 v1, v1, v14
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v15
+; GISEL-NEXT: v_lshlrev_b16_e32 v15, 10, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v14
; GISEL-NEXT: v_or_b32_e32 v1, v1, v15
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v16
+; GISEL-NEXT: v_lshlrev_b16_e32 v16, 11, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v15
; GISEL-NEXT: v_or_b32_e32 v1, v1, v16
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v17
+; GISEL-NEXT: v_lshlrev_b16_e32 v17, 12, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v16
; GISEL-NEXT: v_or_b32_e32 v1, v1, v17
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v18
+; GISEL-NEXT: v_lshlrev_b16_e32 v18, 13, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v17
; GISEL-NEXT: v_or_b32_e32 v1, v1, v18
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v19
+; GISEL-NEXT: v_lshlrev_b16_e32 v19, 14, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v18
; GISEL-NEXT: v_or_b32_e32 v1, v1, v19
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v20
-; GISEL-NEXT: v_or_b32_e32 v1, v1, v20
+; GISEL-NEXT: v_lshlrev_b16_e32 v0, 15, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v19
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v0
+; GISEL-NEXT: v_or_b32_e32 v0, v2, v0
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0
@@ -699,34 +699,34 @@ define i128 @fptoui_f64_to_i128(double %x) {
; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16
; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18
; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1
; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18
-; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20
-; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20
-; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4
-; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
-; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6
-; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6
-; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8
-; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8
-; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10
-; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10
-; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12
-; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1
+; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3
+; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5
+; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7
+; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7
+; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9
+; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9
+; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11
+; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11
+; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13
+; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1
; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12
-; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1
-; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13
+; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1
+; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1
; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1
; GISEL-NEXT: v_mov_b32_e32 v2, v1
; GISEL-NEXT: .LBB1_9: ; %Flow3
@@ -900,52 +900,52 @@ define i128 @fptosi_f32_to_i128(float %x) {
; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7]
; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7]
-; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v5, 3, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v8, 4, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v9, 5, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v10, 6, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v11, 7, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v12, 8, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v13, 9, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v14, 10, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v15, 11, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v16, 12, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v17, 13, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v18, 14, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v19, 15, v0
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0
; GISEL-NEXT: v_or_b32_e32 v1, v1, v2
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v3
+; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v0, v2
; GISEL-NEXT: v_or_b32_e32 v1, v1, v3
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v5
+; GISEL-NEXT: v_lshlrev_b16_e32 v5, 3, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v3
; GISEL-NEXT: v_or_b32_e32 v1, v1, v5
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v8
+; GISEL-NEXT: v_lshlrev_b16_e32 v8, 4, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v5
; GISEL-NEXT: v_or_b32_e32 v1, v1, v8
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v9
+; GISEL-NEXT: v_lshlrev_b16_e32 v9, 5, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v8
; GISEL-NEXT: v_or_b32_e32 v1, v1, v9
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v10
+; GISEL-NEXT: v_lshlrev_b16_e32 v10, 6, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v9
; GISEL-NEXT: v_or_b32_e32 v1, v1, v10
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v11
+; GISEL-NEXT: v_lshlrev_b16_e32 v11, 7, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v10
; GISEL-NEXT: v_or_b32_e32 v1, v1, v11
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v12
+; GISEL-NEXT: v_lshlrev_b16_e32 v12, 8, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v11
; GISEL-NEXT: v_or_b32_e32 v1, v1, v12
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v13
+; GISEL-NEXT: v_lshlrev_b16_e32 v13, 9, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v12
; GISEL-NEXT: v_or_b32_e32 v1, v1, v13
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v14
+; GISEL-NEXT: v_lshlrev_b16_e32 v14, 10, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v13
; GISEL-NEXT: v_or_b32_e32 v1, v1, v14
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v15
+; GISEL-NEXT: v_lshlrev_b16_e32 v15, 11, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v14
; GISEL-NEXT: v_or_b32_e32 v1, v1, v15
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v16
+; GISEL-NEXT: v_lshlrev_b16_e32 v16, 12, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v15
; GISEL-NEXT: v_or_b32_e32 v1, v1, v16
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v17
+; GISEL-NEXT: v_lshlrev_b16_e32 v17, 13, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v16
; GISEL-NEXT: v_or_b32_e32 v1, v1, v17
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v18
+; GISEL-NEXT: v_lshlrev_b16_e32 v18, 14, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v17
; GISEL-NEXT: v_or_b32_e32 v1, v1, v18
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v19
-; GISEL-NEXT: v_or_b32_e32 v1, v1, v19
+; GISEL-NEXT: v_lshlrev_b16_e32 v0, 15, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v18
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v0
+; GISEL-NEXT: v_or_b32_e32 v0, v2, v0
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0
@@ -1054,34 +1054,34 @@ define i128 @fptosi_f32_to_i128(float %x) {
; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16
; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18
; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1
; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18
-; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20
-; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20
-; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4
-; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
-; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6
-; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6
-; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8
-; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8
-; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10
-; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10
-; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12
-; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1
+; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3
+; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5
+; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7
+; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7
+; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9
+; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9
+; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11
+; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11
+; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13
+; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1
; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12
-; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1
-; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13
+; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1
+; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1
; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1
; GISEL-NEXT: v_mov_b32_e32 v2, v1
; GISEL-NEXT: .LBB2_9: ; %Flow3
@@ -1255,52 +1255,52 @@ define i128 @fptoui_f32_to_i128(float %x) {
; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7]
; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7]
-; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v5, 3, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v8, 4, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v9, 5, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v10, 6, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v11, 7, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v12, 8, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v13, 9, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v14, 10, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v15, 11, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v16, 12, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v17, 13, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v18, 14, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v19, 15, v0
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0
; GISEL-NEXT: v_or_b32_e32 v1, v1, v2
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v3
+; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v0, v2
; GISEL-NEXT: v_or_b32_e32 v1, v1, v3
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v5
+; GISEL-NEXT: v_lshlrev_b16_e32 v5, 3, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v3
; GISEL-NEXT: v_or_b32_e32 v1, v1, v5
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v8
+; GISEL-NEXT: v_lshlrev_b16_e32 v8, 4, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v5
; GISEL-NEXT: v_or_b32_e32 v1, v1, v8
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v9
+; GISEL-NEXT: v_lshlrev_b16_e32 v9, 5, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v8
; GISEL-NEXT: v_or_b32_e32 v1, v1, v9
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v10
+; GISEL-NEXT: v_lshlrev_b16_e32 v10, 6, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v9
; GISEL-NEXT: v_or_b32_e32 v1, v1, v10
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v11
+; GISEL-NEXT: v_lshlrev_b16_e32 v11, 7, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v10
; GISEL-NEXT: v_or_b32_e32 v1, v1, v11
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v12
+; GISEL-NEXT: v_lshlrev_b16_e32 v12, 8, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v11
; GISEL-NEXT: v_or_b32_e32 v1, v1, v12
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v13
+; GISEL-NEXT: v_lshlrev_b16_e32 v13, 9, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v12
; GISEL-NEXT: v_or_b32_e32 v1, v1, v13
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v14
+; GISEL-NEXT: v_lshlrev_b16_e32 v14, 10, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v13
; GISEL-NEXT: v_or_b32_e32 v1, v1, v14
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v15
+; GISEL-NEXT: v_lshlrev_b16_e32 v15, 11, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v14
; GISEL-NEXT: v_or_b32_e32 v1, v1, v15
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v16
+; GISEL-NEXT: v_lshlrev_b16_e32 v16, 12, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v15
; GISEL-NEXT: v_or_b32_e32 v1, v1, v16
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v17
+; GISEL-NEXT: v_lshlrev_b16_e32 v17, 13, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v16
; GISEL-NEXT: v_or_b32_e32 v1, v1, v17
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v18
+; GISEL-NEXT: v_lshlrev_b16_e32 v18, 14, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v17
; GISEL-NEXT: v_or_b32_e32 v1, v1, v18
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v19
-; GISEL-NEXT: v_or_b32_e32 v1, v1, v19
+; GISEL-NEXT: v_lshlrev_b16_e32 v0, 15, v0
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v18
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v0
+; GISEL-NEXT: v_or_b32_e32 v0, v2, v0
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0
@@ -1409,34 +1409,34 @@ define i128 @fptoui_f32_to_i128(float %x) {
; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16
; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18
; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1
; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18
-; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20
-; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20
-; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4
-; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
-; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6
-; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6
-; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8
-; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8
-; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10
-; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10
-; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12
-; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1
+; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3
+; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5
+; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7
+; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7
+; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9
+; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9
+; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11
+; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11
+; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13
+; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1
; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12
-; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1
-; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13
+; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1
+; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1
; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1
; GISEL-NEXT: v_mov_b32_e32 v2, v1
; GISEL-NEXT: .LBB3_9: ; %Flow3
@@ -1786,34 +1786,34 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16
; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18
; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1
; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18
-; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20
-; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20
-; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4
-; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
-; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6
-; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6
-; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8
-; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8
-; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10
-; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10
-; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12
-; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1
+; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3
+; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5
+; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7
+; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7
+; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9
+; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9
+; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11
+; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11
+; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13
+; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1
; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12
-; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1
-; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13
+; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1
+; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1
; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1
; GISEL-NEXT: v_mov_b32_e32 v2, v1
; GISEL-NEXT: .LBB6_9: ; %Flow3
@@ -2135,34 +2135,34 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16
; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18
; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1
; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18
-; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20
-; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20
-; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4
-; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
-; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6
-; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6
-; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8
-; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8
-; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10
-; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10
-; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12
-; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1
+; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3
+; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5
+; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7
+; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7
+; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9
+; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9
+; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11
+; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11
+; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13
+; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1
; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1
-; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12
-; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1
-; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13
+; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1
+; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1
; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1
; GISEL-NEXT: v_mov_b32_e32 v2, v1
; GISEL-NEXT: .LBB7_9: ; %Flow3
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
index 4f3086a9eb1f9a..34ee90c68569fd 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -1209,50 +1209,50 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) {
; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
; SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6
; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6
-; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
-; SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7]
+; SDAG-NEXT: v_lshlrev_b32_e32 v10, 8, v10
+; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
; SDAG-NEXT: v_rsq_f64_e32 v[8:9], v[2:3]
-; SDAG-NEXT: v_rsq_f64_e32 v[10:11], v[4:5]
-; SDAG-NEXT: v_mul_f64 v[12:13], v[0:1], v[6:7]
+; SDAG-NEXT: v_rsq_f64_e32 v[12:13], v[4:5]
+; SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7]
; SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
; SDAG-NEXT: v_mul_f64 v[14:15], v[2:3], v[8:9]
; SDAG-NEXT: v_mul_f64 v[8:9], v[8:9], 0.5
-; SDAG-NEXT: v_mul_f64 v[16:17], v[4:5], v[10:11]
-; SDAG-NEXT: v_mul_f64 v[10:11], v[10:11], 0.5
-; SDAG-NEXT: v_fma_f64 v[18:19], -v[6:7], v[12:13], 0.5
-; SDAG-NEXT: v_fma_f64 v[20:21], -v[8:9], v[14:15], 0.5
-; SDAG-NEXT: v_fma_f64 v[22:23], -v[10:11], v[16:17], 0.5
+; SDAG-NEXT: v_fma_f64 v[16:17], -v[6:7], v[10:11], 0.5
+; SDAG-NEXT: v_fma_f64 v[18:19], -v[8:9], v[14:15], 0.5
+; SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[16:17], v[10:11]
+; SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7]
+; SDAG-NEXT: v_mul_f64 v[16:17], v[4:5], v[12:13]
+; SDAG-NEXT: v_mul_f64 v[12:13], v[12:13], 0.5
+; SDAG-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
+; SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[18:19], v[8:9]
+; SDAG-NEXT: v_fma_f64 v[18:19], -v[12:13], v[16:17], 0.5
+; SDAG-NEXT: v_fma_f64 v[16:17], v[16:17], v[18:19], v[16:17]
; SDAG-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
-; SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
-; SDAG-NEXT: v_fma_f64 v[14:15], v[14:15], v[20:21], v[14:15]
-; SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[20:21], v[8:9]
-; SDAG-NEXT: v_fma_f64 v[16:17], v[16:17], v[22:23], v[16:17]
-; SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[22:23], v[10:11]
-; SDAG-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[20:21], -v[14:15], v[14:15], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[22:23], -v[16:17], v[16:17], v[4:5]
-; SDAG-NEXT: v_fma_f64 v[12:13], v[18:19], v[6:7], v[12:13]
-; SDAG-NEXT: v_fma_f64 v[14:15], v[20:21], v[8:9], v[14:15]
-; SDAG-NEXT: v_fma_f64 v[16:17], v[22:23], v[10:11], v[16:17]
-; SDAG-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[20:21], -v[14:15], v[14:15], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[22:23], -v[16:17], v[16:17], v[4:5]
-; SDAG-NEXT: v_fma_f64 v[6:7], v[18:19], v[6:7], v[12:13]
-; SDAG-NEXT: v_mov_b32_e32 v12, 0xffffff80
-; SDAG-NEXT: v_mov_b32_e32 v13, 0x260
-; SDAG-NEXT: v_fma_f64 v[8:9], v[20:21], v[8:9], v[14:15]
-; SDAG-NEXT: v_cndmask_b32_e32 v14, 0, v12, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v15, 0, v12, s[4:5]
-; SDAG-NEXT: v_fma_f64 v[10:11], v[22:23], v[10:11], v[16:17]
-; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[6:7]
-; SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v14
-; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13
-; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v13
-; SDAG-NEXT: v_ldexp_f64 v[8:9], v[8:9], v15
-; SDAG-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v13
-; SDAG-NEXT: v_ldexp_f64 v[10:11], v[10:11], v12
+; SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11]
+; SDAG-NEXT: v_fma_f64 v[18:19], -v[14:15], v[14:15], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[14:15], v[18:19], v[8:9], v[14:15]
+; SDAG-NEXT: v_fma_f64 v[18:19], -v[16:17], v[16:17], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[16:17], v[18:19], v[12:13], v[16:17]
+; SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[6:7], v[18:19], v[6:7], v[10:11]
+; SDAG-NEXT: v_fma_f64 v[10:11], -v[14:15], v[14:15], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[18:19], -v[16:17], v[16:17], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[8:9], v[10:11], v[8:9], v[14:15]
+; SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[12:13], v[16:17]
+; SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v15, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v14, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v14, 0, v14, s[6:7]
+; SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v12
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15
+; SDAG-NEXT: v_ldexp_f64 v[8:9], v[8:9], v13
+; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v15
+; SDAG-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14
+; SDAG-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v15
; SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
; SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5]
@@ -1266,61 +1266,61 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s4, 0
; GISEL-NEXT: s_brev_b32 s5, 8
-; GISEL-NEXT: v_mov_b32_e32 v6, s4
; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
+; GISEL-NEXT: v_mov_b32_e32 v6, s4
; GISEL-NEXT: v_mov_b32_e32 v7, s5
; GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7]
; GISEL-NEXT: v_cmp_lt_f64_e64 s[6:7], v[4:5], v[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
; GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8
+; GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
; GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
-; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
-; GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[2:3]
-; GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[4:5]
-; GISEL-NEXT: v_mul_f64 v[12:13], v[6:7], 0.5
-; GISEL-NEXT: v_mul_f64 v[6:7], v[0:1], v[6:7]
-; GISEL-NEXT: v_mul_f64 v[14:15], v[8:9], 0.5
-; GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[8:9]
-; GISEL-NEXT: v_mul_f64 v[16:17], v[10:11], 0.5
-; GISEL-NEXT: v_mul_f64 v[10:11], v[4:5], v[10:11]
-; GISEL-NEXT: v_fma_f64 v[18:19], -v[12:13], v[6:7], 0.5
-; GISEL-NEXT: v_fma_f64 v[20:21], -v[14:15], v[8:9], 0.5
-; GISEL-NEXT: v_fma_f64 v[22:23], -v[16:17], v[10:11], 0.5
-; GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
+; GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3]
+; GISEL-NEXT: v_rsq_f64_e32 v[12:13], v[4:5]
+; GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], 0.5
+; GISEL-NEXT: v_mul_f64 v[8:9], v[0:1], v[8:9]
+; GISEL-NEXT: v_mul_f64 v[14:15], v[10:11], 0.5
+; GISEL-NEXT: v_mul_f64 v[10:11], v[2:3], v[10:11]
+; GISEL-NEXT: v_fma_f64 v[16:17], -v[6:7], v[8:9], 0.5
+; GISEL-NEXT: v_fma_f64 v[18:19], -v[14:15], v[10:11], 0.5
+; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[16:17], v[8:9]
+; GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7]
+; GISEL-NEXT: v_mul_f64 v[16:17], v[12:13], 0.5
+; GISEL-NEXT: v_mul_f64 v[12:13], v[4:5], v[12:13]
+; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11]
+; GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
+; GISEL-NEXT: v_fma_f64 v[18:19], -v[16:17], v[12:13], 0.5
; GISEL-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
-; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[20:21], v[8:9]
-; GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[20:21], v[14:15]
-; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[22:23], v[10:11]
-; GISEL-NEXT: v_fma_f64 v[18:19], -v[6:7], v[6:7], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[16:17], v[16:17], v[22:23], v[16:17]
-; GISEL-NEXT: v_fma_f64 v[20:21], -v[8:9], v[8:9], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[22:23], -v[10:11], v[10:11], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], v[18:19], v[12:13], v[6:7]
-; GISEL-NEXT: v_fma_f64 v[8:9], v[20:21], v[14:15], v[8:9]
-; GISEL-NEXT: v_fma_f64 v[10:11], v[22:23], v[16:17], v[10:11]
-; GISEL-NEXT: v_fma_f64 v[18:19], -v[6:7], v[6:7], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[20:21], -v[8:9], v[8:9], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[22:23], -v[10:11], v[10:11], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], v[18:19], v[12:13], v[6:7]
-; GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v13, 0x260
-; GISEL-NEXT: v_fma_f64 v[8:9], v[20:21], v[14:15], v[8:9]
-; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v12, vcc
-; GISEL-NEXT: v_fma_f64 v[10:11], v[22:23], v[16:17], v[10:11]
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, v12, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[6:7]
-; GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v14
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13
-; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v13
-; GISEL-NEXT: v_ldexp_f64 v[8:9], v[8:9], v15
-; GISEL-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v13
-; GISEL-NEXT: v_ldexp_f64 v[10:11], v[10:11], v12
+; GISEL-NEXT: v_fma_f64 v[16:17], v[16:17], v[18:19], v[16:17]
+; GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[8:9], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[8:9], v[18:19], v[6:7], v[8:9]
+; GISEL-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[10:11], v[18:19], v[14:15], v[10:11]
+; GISEL-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[12:13], v[18:19], v[16:17], v[12:13]
+; GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[8:9], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[6:7], v[18:19], v[6:7], v[8:9]
+; GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[10:11], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[10:11]
+; GISEL-NEXT: v_fma_f64 v[10:11], v[18:19], v[16:17], v[12:13]
+; GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v15, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, v14, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, v14, s[6:7]
+; GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v12
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15
+; GISEL-NEXT: v_ldexp_f64 v[8:9], v[8:9], v13
+; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v15
+; GISEL-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14
+; GISEL-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v15
; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 3b2f15c8340a63..78e521aba120e9 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -671,17 +671,17 @@ define void @void_func_v33i32(<33 x i32> %arg0) #0 {
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
-; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
+; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
+; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(5)
+; CI-NEXT: s_waitcnt vmcnt(6)
; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(5)
-; CI-NEXT: buffer_store_dword v16, off, s[4:7], 0
+; CI-NEXT: s_waitcnt vmcnt(6)
+; CI-NEXT: buffer_store_dword v20, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -692,17 +692,17 @@ define void @void_func_v33i32(<33 x i32> %arg0) #0 {
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
-; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -713,19 +713,19 @@ define void @void_func_v33i32(<33 x i32> %arg0) #0 {
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
-; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(5)
-; GFX9-NEXT: buffer_store_dword v16, off, s[4:7], 0
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1388,137 +1388,137 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 {
; CI-LABEL: void_func_v32i8:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; CI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; CI-NEXT: v_or_b32_e32 v2, v3, v2
-; CI-NEXT: buffer_load_dword v3, off, s[0:3], s32
; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
; CI-NEXT: v_and_b32_e32 v4, 0xff, v4
-; CI-NEXT: v_and_b32_e32 v6, 0xff, v6
; CI-NEXT: v_or_b32_e32 v4, v4, v5
-; CI-NEXT: v_lshlrev_b32_e32 v5, 24, v7
-; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32
+; CI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; CI-NEXT: v_and_b32_e32 v8, 0xff, v8
; CI-NEXT: v_lshlrev_b32_e32 v13, 8, v13
; CI-NEXT: v_and_b32_e32 v12, 0xff, v12
-; CI-NEXT: v_or_b32_e32 v5, v5, v6
-; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; CI-NEXT: v_or_b32_e32 v8, v8, v9
; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; CI-NEXT: v_and_b32_e32 v0, 0xff, v0
-; CI-NEXT: v_or_b32_e32 v12, v12, v13
-; CI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; CI-NEXT: v_and_b32_e32 v8, 0xff, v8
-; CI-NEXT: v_and_b32_e32 v13, 0xff, v14
+; CI-NEXT: v_and_b32_e32 v9, 0xff, v14
; CI-NEXT: v_and_b32_e32 v10, 0xff, v10
-; CI-NEXT: v_or_b32_e32 v7, v4, v5
+; CI-NEXT: v_and_b32_e32 v6, 0xff, v6
+; CI-NEXT: v_or_b32_e32 v12, v12, v13
; CI-NEXT: v_or_b32_e32 v0, v0, v1
-; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v29
-; CI-NEXT: v_and_b32_e32 v4, 0xff, v28
-; CI-NEXT: v_and_b32_e32 v6, 0xff, v26
-; CI-NEXT: v_or_b32_e32 v8, v8, v9
-; CI-NEXT: v_lshlrev_b32_e32 v9, 24, v15
-; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; CI-NEXT: v_lshlrev_b32_e32 v1, 24, v15
; CI-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; CI-NEXT: v_lshlrev_b32_e32 v7, 24, v7
+; CI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; CI-NEXT: v_or_b32_e32 v1, v4, v1
-; CI-NEXT: v_and_b32_e32 v4, 0xff, v30
-; CI-NEXT: v_lshlrev_b32_e32 v5, 24, v27
; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; CI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; CI-NEXT: v_lshlrev_b32_e32 v13, 8, v29
+; CI-NEXT: v_and_b32_e32 v14, 0xff, v28
+; CI-NEXT: v_and_b32_e32 v26, 0xff, v26
+; CI-NEXT: v_lshlrev_b32_e32 v25, 8, v25
+; CI-NEXT: v_and_b32_e32 v24, 0xff, v24
+; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_or_b32_e32 v1, v1, v9
+; CI-NEXT: v_or_b32_e32 v9, v11, v10
+; CI-NEXT: v_and_b32_e32 v10, 0xffff, v12
+; CI-NEXT: v_or_b32_e32 v6, v7, v6
+; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; CI-NEXT: v_lshlrev_b32_e32 v15, 24, v27
+; CI-NEXT: v_and_b32_e32 v27, 0xff, v30
+; CI-NEXT: v_or_b32_e32 v13, v14, v13
+; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v26
+; CI-NEXT: v_or_b32_e32 v7, v3, v2
+; CI-NEXT: v_or_b32_e32 v3, v10, v1
+; CI-NEXT: v_or_b32_e32 v1, v4, v6
+; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v27
+; CI-NEXT: v_or_b32_e32 v11, v15, v14
; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; CI-NEXT: v_or_b32_e32 v9, v9, v13
-; CI-NEXT: v_or_b32_e32 v10, v11, v10
-; CI-NEXT: v_and_b32_e32 v11, 0xffff, v12
+; CI-NEXT: v_and_b32_e32 v12, 0xffff, v13
; CI-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; CI-NEXT: v_or_b32_e32 v5, v5, v6
-; CI-NEXT: v_or_b32_e32 v6, v0, v2
-; CI-NEXT: v_or_b32_e32 v9, v11, v9
-; CI-NEXT: v_or_b32_e32 v8, v8, v10
-; CI-NEXT: v_lshlrev_b32_e32 v10, 8, v25
-; CI-NEXT: v_and_b32_e32 v11, 0xff, v24
-; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; CI-NEXT: v_or_b32_e32 v0, v0, v7
+; CI-NEXT: v_or_b32_e32 v2, v8, v9
+; CI-NEXT: v_and_b32_e32 v8, 0xff, v20
+; CI-NEXT: v_and_b32_e32 v9, 0xff, v16
; CI-NEXT: s_mov_b32 s5, 0
; CI-NEXT: s_mov_b32 s4, 16
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_lshlrev_b32_e32 v0, 24, v3
-; CI-NEXT: v_or_b32_e32 v0, v0, v4
-; CI-NEXT: v_or_b32_e32 v3, v1, v0
-; CI-NEXT: v_or_b32_e32 v0, v11, v10
-; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; CI-NEXT: v_and_b32_e32 v1, 0xff, v22
-; CI-NEXT: v_or_b32_e32 v2, v0, v5
-; CI-NEXT: v_lshlrev_b32_e32 v0, 24, v23
-; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; CI-NEXT: v_or_b32_e32 v0, v0, v1
-; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v21
-; CI-NEXT: v_and_b32_e32 v4, 0xff, v20
-; CI-NEXT: v_or_b32_e32 v1, v4, v1
-; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; CI-NEXT: v_and_b32_e32 v4, 0xff, v18
-; CI-NEXT: v_or_b32_e32 v1, v1, v0
-; CI-NEXT: v_lshlrev_b32_e32 v0, 24, v19
-; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; CI-NEXT: v_or_b32_e32 v0, v0, v4
-; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v17
-; CI-NEXT: v_and_b32_e32 v5, 0xff, v16
-; CI-NEXT: v_or_b32_e32 v4, v5, v4
-; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; CI-NEXT: v_or_b32_e32 v0, v4, v0
-; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT: v_lshlrev_b32_e32 v4, 24, v5
+; CI-NEXT: v_or_b32_e32 v5, v24, v25
+; CI-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; CI-NEXT: v_or_b32_e32 v4, v4, v26
+; CI-NEXT: v_or_b32_e32 v6, v5, v11
+; CI-NEXT: v_and_b32_e32 v5, 0xff, v22
+; CI-NEXT: v_or_b32_e32 v7, v12, v4
+; CI-NEXT: v_lshlrev_b32_e32 v4, 24, v23
+; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; CI-NEXT: v_or_b32_e32 v4, v4, v5
+; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v21
+; CI-NEXT: v_or_b32_e32 v5, v8, v5
+; CI-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; CI-NEXT: v_and_b32_e32 v8, 0xff, v18
+; CI-NEXT: v_or_b32_e32 v5, v5, v4
+; CI-NEXT: v_lshlrev_b32_e32 v4, 24, v19
+; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; CI-NEXT: v_or_b32_e32 v4, v4, v8
+; CI-NEXT: v_lshlrev_b32_e32 v8, 8, v17
+; CI-NEXT: v_or_b32_e32 v8, v9, v8
+; CI-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; CI-NEXT: v_or_b32_e32 v4, v8, v4
+; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; CI-NEXT: s_mov_b32 s4, s5
-; CI-NEXT: buffer_store_dwordx4 v[6:9], off, s[4:7], 0
+; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: void_func_v32i8:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: v_lshlrev_b16_e32 v9, 8, v9
+; GFX89-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX89-NEXT: v_lshlrev_b16_e32 v9, 8, v11
+; GFX89-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX89-NEXT: buffer_load_ubyte v10, off, s[0:3], s32
; GFX89-NEXT: v_lshlrev_b16_e32 v13, 8, v13
; GFX89-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX89-NEXT: v_lshlrev_b16_e32 v13, 8, v15
-; GFX89-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX89-NEXT: buffer_load_ubyte v14, off, s[0:3], s32
; GFX89-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; GFX89-NEXT: v_lshlrev_b16_e32 v7, 8, v7
+; GFX89-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX89-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX89-NEXT: v_lshlrev_b16_e32 v11, 8, v29
+; GFX89-NEXT: v_lshlrev_b16_e32 v14, 8, v25
+; GFX89-NEXT: v_lshlrev_b16_e32 v15, 8, v27
+; GFX89-NEXT: v_lshlrev_b16_e32 v21, 8, v21
+; GFX89-NEXT: v_lshlrev_b16_e32 v23, 8, v23
+; GFX89-NEXT: v_lshlrev_b16_e32 v17, 8, v17
+; GFX89-NEXT: v_lshlrev_b16_e32 v19, 8, v19
; GFX89-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX89-NEXT: v_lshlrev_b16_e32 v5, 8, v7
-; GFX89-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v3
-; GFX89-NEXT: v_lshlrev_b16_e32 v9, 8, v9
-; GFX89-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX89-NEXT: v_or_b32_sdwa v6, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v29
-; GFX89-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX89-NEXT: v_lshlrev_b16_e32 v9, 8, v11
-; GFX89-NEXT: v_or_b32_sdwa v7, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v25
-; GFX89-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX89-NEXT: v_or_b32_sdwa v10, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v27
-; GFX89-NEXT: v_lshlrev_b16_e32 v2, 8, v23
-; GFX89-NEXT: v_or_b32_sdwa v11, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v21
-; GFX89-NEXT: v_lshlrev_b16_e32 v3, 8, v17
-; GFX89-NEXT: v_lshlrev_b16_e32 v15, 8, v19
-; GFX89-NEXT: v_or_b32_sdwa v19, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX89-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX89-NEXT: v_or_b32_sdwa v6, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX89-NEXT: v_or_b32_sdwa v2, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX89-NEXT: v_or_b32_sdwa v17, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX89-NEXT: v_or_b32_sdwa v16, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX89-NEXT: v_or_b32_sdwa v15, v18, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX89-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX89-NEXT: v_or_b32_sdwa v7, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX89-NEXT: v_or_b32_sdwa v11, v24, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX89-NEXT: v_or_b32_sdwa v14, v26, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX89-NEXT: v_or_b32_sdwa v15, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX89-NEXT: v_or_b32_sdwa v20, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX89-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX89-NEXT: v_or_b32_sdwa v17, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX89-NEXT: s_mov_b32 s5, 0
; GFX89-NEXT: s_mov_b32 s4, 16
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
; GFX89-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX89-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX89-NEXT: v_or_b32_sdwa v6, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX89-NEXT: v_or_b32_sdwa v5, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX89-NEXT: v_or_b32_sdwa v4, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX89-NEXT: v_or_b32_sdwa v6, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX89-NEXT: v_or_b32_sdwa v5, v15, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX89-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX89-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: v_lshlrev_b16_e32 v8, 8, v14
+; GFX89-NEXT: v_lshlrev_b16_e32 v8, 8, v10
; GFX89-NEXT: v_or_b32_sdwa v8, v30, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX89-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
@@ -2622,102 +2622,37 @@ define void @void_func_byval_i32_byval_i64(ptr addrspace(5) byval(i32) %arg0, pt
}
define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 {
-; CI-LABEL: void_func_v32i32_i32_i64:
-; CI: ; %bb.0:
-; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; CI-NEXT: s_mov_b32 s7, 0xf000
-; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12
-; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8
-; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
-; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dword v20, off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: void_func_v32i32_i32_i64:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: void_func_v32i32_i32_i64:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; CIGFX89-LABEL: void_func_v32i32_i32_i64:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; CIGFX89-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
+; CIGFX89-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; CIGFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4
+; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT: s_mov_b32 s6, -1
+; CIGFX89-NEXT: s_waitcnt vmcnt(3)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dword v34, off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx2 v[32:33], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: void_func_v32i32_i32_i64:
; GFX11: ; %bb.0:
@@ -2765,129 +2700,86 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
+; CI-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:4
+; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8
+; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: s_waitcnt vmcnt(5)
; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20
; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16
-; CI-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:4
-; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8
-; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12
; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_mul_f32_e32 v12, 1.0, v32
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v33
; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
-; CI-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; CI-NEXT: v_and_b32_e32 v0, 1, v17
-; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v20
+; CI-NEXT: v_and_b32_e32 v0, 1, v34
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v12
; CI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_byte v18, off, s[4:7], 0
+; CI-NEXT: buffer_store_byte v35, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_short v19, off, s[4:7], 0
+; CI-NEXT: buffer_store_short v36, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_short v16, off, s[4:7], 0
+; CI-NEXT: buffer_store_short v13, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_short v1, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_setpc_b64 s[30:31]
;
-; VI-LABEL: void_func_v32i32_i1_i8_i16_bf16:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:16
-; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_and_b32_e32 v0, 1, v20
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_short v17, off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_short v18, off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_short v19, off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: void_func_v32i32_i1_i8_i16_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:4
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:16
-; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:20
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v20
-; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_short v17, off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_short v18, off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_short v19, off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX89-LABEL: void_func_v32i32_i1_i8_i16_bf16:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX89-NEXT: buffer_load_ubyte v32, off, s[0:3], s32 offset:4
+; GFX89-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8
+; GFX89-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:12
+; GFX89-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:16
+; GFX89-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:20
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: s_waitcnt vmcnt(5)
+; GFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: v_and_b32_e32 v0, 1, v32
+; GFX89-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: buffer_store_byte v33, off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: buffer_store_short v34, off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: buffer_store_short v35, off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: buffer_store_short v36, off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: void_func_v32i32_i1_i8_i16_bf16:
; GFX11: ; %bb.0:
@@ -2945,105 +2837,38 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
}
define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 x float> %arg2) #0 {
-; CI-LABEL: void_func_v32i32_v2i32_v2f32:
-; CI: ; %bb.0:
-; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; CI-NEXT: s_mov_b32 s7, 0xf000
-; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8
-; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
-; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16
-; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
-; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx2 v[18:19], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: void_func_v32i32_v2i32_v2f32:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[18:19], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: void_func_v32i32_v2i32_v2f32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16
-; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[18:19], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; CIGFX89-LABEL: void_func_v32i32_v2i32_v2f32:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; CIGFX89-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; CIGFX89-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; CIGFX89-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16
+; CIGFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT: s_mov_b32 s6, -1
+; CIGFX89-NEXT: s_waitcnt vmcnt(4)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx2 v[32:33], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx2 v[34:35], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: void_func_v32i32_v2i32_v2f32:
; GFX11: ; %bb.0:
@@ -3093,54 +2918,54 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i
; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
+; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36
+; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:40
+; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20
+; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24
+; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12
+; CI-NEXT: s_waitcnt vmcnt(7)
; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:32
-; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36
-; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40
-; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20
+; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8
+; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4
; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24
-; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12
-; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:16
-; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8
-; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28
; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:4
; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v38
+; CI-NEXT: v_mul_f32_e32 v4, 1.0, v32
+; CI-NEXT: v_mul_f32_e32 v5, 1.0, v33
+; CI-NEXT: v_mul_f32_e32 v6, 1.0, v34
+; CI-NEXT: v_mul_f32_e32 v7, 1.0, v35
+; CI-NEXT: v_mul_f32_e32 v8, 1.0, v36
+; CI-NEXT: v_mul_f32_e32 v9, 1.0, v37
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_short v15, off, s[4:7], 0
+; CI-NEXT: buffer_store_short v16, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_short v8, off, s[4:7], 0
+; CI-NEXT: buffer_store_short v17, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
-; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
-; CI-NEXT: v_mul_f32_e32 v9, 1.0, v20
-; CI-NEXT: v_mul_f32_e32 v10, 1.0, v16
-; CI-NEXT: v_mul_f32_e32 v11, 1.0, v17
-; CI-NEXT: v_mul_f32_e32 v16, 1.0, v18
-; CI-NEXT: v_mul_f32_e32 v17, 1.0, v19
-; CI-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v9
-; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v10
-; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v11
-; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v16
-; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v17
-; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
-; CI-NEXT: buffer_store_short v14, off, s[4:7], 0
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v20
+; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v7
+; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v8
+; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; CI-NEXT: buffer_store_short v11, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_short v13, off, s[4:7], 0
+; CI-NEXT: buffer_store_short v10, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_short v5, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -3156,82 +2981,43 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_setpc_b64 s[30:31]
;
-; VI-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16
-; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v19, off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16
-; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v18, off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v19, off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX89-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX89-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
+; GFX89-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4
+; GFX89-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8
+; GFX89-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: s_waitcnt vmcnt(5)
+; GFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: buffer_store_dword v34, off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: buffer_store_dword v35, off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: buffer_store_dword v36, off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: buffer_store_dwordx2 v[32:33], off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16:
; GFX11: ; %bb.0:
@@ -3277,284 +3063,132 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i
; GFX11-NEXT: s_setpc_b64 s[30:31]
store volatile <32 x i32> %arg0, ptr addrspace(1) undef
store volatile <2 x i16> %arg1, ptr addrspace(1) undef
- store volatile <2 x half> %arg2, ptr addrspace(1) undef
- store volatile <2 x bfloat> %arg3, ptr addrspace(1) undef
- store volatile <4 x bfloat> %arg4, ptr addrspace(1) undef
- ret void
-}
-
-define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 x double> %arg2) #0 {
-; CI-LABEL: void_func_v32i32_v2i64_v2f64:
-; CI: ; %bb.0:
-; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; CI-NEXT: s_mov_b32 s7, 0xf000
-; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16
-; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
-; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8
-; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
-; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
-; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28
-; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24
-; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20
-; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: void_func_v32i32_v2i64_v2f64:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
-; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24
-; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: void_func_v32i32_v2i64_v2f64:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16
-; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
-; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24
-; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: void_func_v32i32_v2i64_v2f64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x8
-; GFX11-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:32
-; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:28
-; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:24
-; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:16
-; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:12
-; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:20
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- store volatile <32 x i32> %arg0, ptr addrspace(1) undef
- store volatile <2 x i64> %arg1, ptr addrspace(1) undef
- store volatile <2 x double> %arg2, ptr addrspace(1) undef
- ret void
-}
-
-define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #0 {
-; CI-LABEL: void_func_v32i32_v4i32_v4f32:
-; CI: ; %bb.0:
-; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; CI-NEXT: s_mov_b32 s7, 0xf000
-; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16
-; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
-; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8
-; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
-; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
-; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28
-; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24
-; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20
-; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: void_func_v32i32_v4i32_v4f32:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
-; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24
-; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: void_func_v32i32_v4i32_v4f32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16
-; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
-; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24
-; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+ store volatile <2 x half> %arg2, ptr addrspace(1) undef
+ store volatile <2 x bfloat> %arg3, ptr addrspace(1) undef
+ store volatile <4 x bfloat> %arg4, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 x double> %arg2) #0 {
+; CIGFX89-LABEL: void_func_v32i32_v2i64_v2f64:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; CIGFX89-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20
+; CIGFX89-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16
+; CIGFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; CIGFX89-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; CIGFX89-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; CIGFX89-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
+; CIGFX89-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28
+; CIGFX89-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24
+; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT: s_mov_b32 s6, -1
+; CIGFX89-NEXT: s_waitcnt vmcnt(8)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v32i32_v2i64_v2f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x8
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:32
+; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:28
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:24
+; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:20
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_waitcnt vmcnt(8)
+; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store volatile <32 x i32> %arg0, ptr addrspace(1) undef
+ store volatile <2 x i64> %arg1, ptr addrspace(1) undef
+ store volatile <2 x double> %arg2, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #0 {
+; CIGFX89-LABEL: void_func_v32i32_v4i32_v4f32:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; CIGFX89-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20
+; CIGFX89-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16
+; CIGFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; CIGFX89-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; CIGFX89-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; CIGFX89-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
+; CIGFX89-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28
+; CIGFX89-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24
+; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT: s_mov_b32 s6, -1
+; CIGFX89-NEXT: s_waitcnt vmcnt(8)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: void_func_v32i32_v4i32_v4f32:
; GFX11: ; %bb.0:
@@ -3608,7 +3242,14 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64
+; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60
+; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; CI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16
+; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12
+; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
+; CI-NEXT: s_waitcnt vmcnt(7)
; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
@@ -3617,37 +3258,30 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64
-; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60
-; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56
-; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52
; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16
-; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12
-; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8
-; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4
+; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
+; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
+; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28
+; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24
+; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20
+; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48
+; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44
+; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:40
+; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36
; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32
-; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28
-; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24
-; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20
; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48
-; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44
-; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40
-; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CI-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -3657,7 +3291,14 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16
+; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
+; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
@@ -3666,37 +3307,30 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56
-; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16
-; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:40
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32
-; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24
-; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48
-; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -3706,7 +3340,14 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16
+; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
+; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
@@ -3715,41 +3356,31 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64
-; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16
-; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:40
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32
-; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24
-; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48
-; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40
-; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -3817,9 +3448,16 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64
+; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60
+; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48
+; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44
+; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40
+; CI-NEXT: s_waitcnt vmcnt(7)
; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
@@ -3828,61 +3466,54 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64
-; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60
-; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56
-; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52
; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48
-; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44
-; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40
-; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36
+; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:36
+; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
+; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28
+; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24
+; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20
+; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16
+; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
+; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8
; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32
-; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28
-; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24
-; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20
; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
; CI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96
; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92
; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88
; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84
+; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112
+; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108
+; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112
-; CI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108
-; CI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104
-; CI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
-; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128
-; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124
-; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120
-; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116
-; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CI-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16
-; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12
-; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8
-; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4
-; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100
+; CI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128
+; CI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124
+; CI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120
+; CI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116
+; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80
+; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76
+; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72
+; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68
+; CI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80
-; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76
-; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72
-; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68
; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -3890,9 +3521,16 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48
+; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40
+; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
@@ -3901,61 +3539,54 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56
-; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48
-; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40
-; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32
-; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24
-; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
-; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120
-; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116
-; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16
-; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76
+; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80
-; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76
-; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72
-; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -3963,9 +3594,16 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48
+; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40
+; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
@@ -3974,69 +3612,57 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64
-; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48
-; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40
-; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32
-; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24
-; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104
-; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128
-; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16
-; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80
+; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80
-; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76
-; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72
-; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -4323,7 +3949,14 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
+; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
+; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:48
+; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:52
+; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56
+; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36
+; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40
+; CI-NEXT: s_waitcnt vmcnt(7)
; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
@@ -4332,61 +3965,54 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64
-; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:48
-; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52
-; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56
+; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28
+; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:32
+; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20
+; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:24
; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36
-; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40
-; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44
-; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:28
+; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16
+; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12
+; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8
+; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:4
+; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44
; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32
-; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20
-; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24
-; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:16
; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12
-; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8
-; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4
-; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_byte v16, off, s[4:7], 0
+; CI-NEXT: buffer_store_byte v33, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_byte v20, off, s[4:7], 0
+; CI-NEXT: buffer_store_byte v32, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_byte v19, off, s[4:7], 0
+; CI-NEXT: buffer_store_byte v36, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_byte v18, off, s[4:7], 0
+; CI-NEXT: buffer_store_byte v35, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_byte v17, off, s[4:7], 0
+; CI-NEXT: buffer_store_byte v34, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_byte v14, off, s[4:7], 0
+; CI-NEXT: buffer_store_byte v20, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_byte v13, off, s[4:7], 0
+; CI-NEXT: buffer_store_byte v38, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_byte v12, off, s[4:7], 0
+; CI-NEXT: buffer_store_byte v37, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_byte v8, off, s[4:7], 0
+; CI-NEXT: buffer_store_byte v17, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_byte v15, off, s[4:7], 0
+; CI-NEXT: buffer_store_byte v16, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_byte v10, off, s[4:7], 0
+; CI-NEXT: buffer_store_byte v19, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_byte v9, off, s[4:7], 0
+; CI-NEXT: buffer_store_byte v18, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_byte v11, off, s[4:7], 0
+; CI-NEXT: buffer_store_byte v12, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_byte v4, off, s[4:7], 0
+; CI-NEXT: buffer_store_byte v13, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_byte v5, off, s[4:7], 0
+; CI-NEXT: buffer_store_byte v14, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_byte v6, off, s[4:7], 0
+; CI-NEXT: buffer_store_byte v15, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -4396,7 +4022,14 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_ubyte v32, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ubyte v33, off, s[0:3], s32 offset:64
+; VI-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:48
+; VI-NEXT: buffer_load_ubyte v35, off, s[0:3], s32 offset:52
+; VI-NEXT: buffer_load_ubyte v36, off, s[0:3], s32 offset:56
+; VI-NEXT: buffer_load_ubyte v37, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ubyte v38, off, s[0:3], s32 offset:40
+; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
@@ -4405,61 +4038,54 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:64
-; VI-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:48
-; VI-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:52
-; VI-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:56
+; VI-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:32
+; VI-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:24
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:40
-; VI-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:16
+; VI-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:44
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ubyte v8, off, s[0:3], s32 offset:32
-; VI-NEXT: buffer_load_ubyte v9, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ubyte v10, off, s[0:3], s32 offset:24
-; VI-NEXT: buffer_load_ubyte v11, off, s[0:3], s32 offset:16
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ubyte v4, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ubyte v5, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_ubyte v6, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:60
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v33, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v20, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v32, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v19, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v36, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v18, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v35, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v17, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v34, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v14, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v20, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v13, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v38, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v12, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v37, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v8, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v17, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v15, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v10, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v19, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v9, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v18, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v11, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v12, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v4, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v13, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v5, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v14, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v6, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v15, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -4469,7 +4095,14 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_ubyte v32, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ubyte v33, off, s[0:3], s32 offset:64
+; GFX9-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:48
+; GFX9-NEXT: buffer_load_ubyte v35, off, s[0:3], s32 offset:52
+; GFX9-NEXT: buffer_load_ubyte v36, off, s[0:3], s32 offset:56
+; GFX9-NEXT: buffer_load_ubyte v37, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ubyte v38, off, s[0:3], s32 offset:40
+; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
@@ -4478,65 +4111,56 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:64
-; GFX9-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:48
-; GFX9-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:52
-; GFX9-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:56
-; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:32
+; GFX9-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:24
+; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:44
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:40
-; GFX9-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:16
+; GFX9-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:4
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_ubyte v8, off, s[0:3], s32 offset:32
-; GFX9-NEXT: buffer_load_ubyte v9, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ubyte v10, off, s[0:3], s32 offset:24
-; GFX9-NEXT: buffer_load_ubyte v11, off, s[0:3], s32 offset:16
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_ubyte v4, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ubyte v5, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_ubyte v6, off, s[0:3], s32 offset:4
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v33, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_byte v20, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v32, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_byte v19, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v36, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_byte v18, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v35, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_byte v17, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v34, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_byte v14, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v20, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_byte v13, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v38, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_byte v12, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v37, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_byte v8, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v17, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_byte v15, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_byte v10, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v19, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_byte v9, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v18, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_byte v11, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v12, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_byte v4, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v13, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_byte v5, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v14, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_byte v6, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v15, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index 401cbce00ac9a8..ac9f56d1ee7b15 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -1497,8 +1497,8 @@ define <33 x i32> @v33i32_func_void() #0 {
; GFX9-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112
; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96
; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80
-; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128
; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64
+; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128
; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48
; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32
; GFX9-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16
@@ -1519,13 +1519,13 @@ define <33 x i32> @v33i32_func_void() #0 {
; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:84
; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:80
; GFX9-NEXT: s_waitcnt vmcnt(17)
-; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128
-; GFX9-NEXT: s_waitcnt vmcnt(17)
; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:76
; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:72
; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:68
; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:64
; GFX9-NEXT: s_waitcnt vmcnt(20)
+; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128
+; GFX9-NEXT: s_waitcnt vmcnt(20)
; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:60
; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:56
; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:52
@@ -1780,8 +1780,8 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
; GFX9-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112
; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96
; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80
-; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128
; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64
+; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128
; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48
; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32
; GFX9-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16
@@ -1802,13 +1802,13 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:84
; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:80
; GFX9-NEXT: s_waitcnt vmcnt(17)
-; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128
-; GFX9-NEXT: s_waitcnt vmcnt(17)
; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:76
; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:72
; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:68
; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:64
; GFX9-NEXT: s_waitcnt vmcnt(20)
+; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128
+; GFX9-NEXT: s_waitcnt vmcnt(20)
; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:60
; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:56
; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:52
@@ -2063,8 +2063,8 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
; GFX9-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:240
; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:224
; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:208
-; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0
; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:192
+; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0
; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:176
; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:160
; GFX9-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:144
@@ -2085,13 +2085,13 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:212
; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:208
; GFX9-NEXT: s_waitcnt vmcnt(17)
-; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(17)
; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:204
; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:200
; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:196
; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:192
; GFX9-NEXT: s_waitcnt vmcnt(20)
+; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen
+; GFX9-NEXT: s_waitcnt vmcnt(20)
; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:188
; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:184
; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:180
@@ -2616,21 +2616,21 @@ define <32 x bfloat> @v32bf16_func_void() #0 {
; CI-NEXT: v_mov_b32_e32 v9, v1
; CI-NEXT: v_mov_b32_e32 v10, v2
; CI-NEXT: v_mov_b32_e32 v11, v3
-; CI-NEXT: v_mov_b32_e32 v12, v4
-; CI-NEXT: v_mov_b32_e32 v13, v5
-; CI-NEXT: v_mov_b32_e32 v14, v6
; CI-NEXT: v_mov_b32_e32 v16, v0
; CI-NEXT: v_mov_b32_e32 v17, v1
; CI-NEXT: v_mov_b32_e32 v18, v2
; CI-NEXT: v_mov_b32_e32 v19, v3
-; CI-NEXT: v_mov_b32_e32 v20, v4
-; CI-NEXT: v_mov_b32_e32 v21, v5
; CI-NEXT: v_mov_b32_e32 v24, v0
; CI-NEXT: v_mov_b32_e32 v25, v1
; CI-NEXT: v_mov_b32_e32 v26, v2
; CI-NEXT: v_mov_b32_e32 v27, v3
+; CI-NEXT: v_mov_b32_e32 v12, v4
+; CI-NEXT: v_mov_b32_e32 v20, v4
; CI-NEXT: v_mov_b32_e32 v28, v4
+; CI-NEXT: v_mov_b32_e32 v13, v5
+; CI-NEXT: v_mov_b32_e32 v21, v5
; CI-NEXT: v_mov_b32_e32 v29, v5
+; CI-NEXT: v_mov_b32_e32 v14, v6
; CI-NEXT: v_mov_b32_e32 v22, v6
; CI-NEXT: v_mov_b32_e32 v30, v6
; CI-NEXT: v_mov_b32_e32 v15, v7
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index 545a9af3f9a0bd..5ccbc85f46dd40 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -5227,19 +5227,19 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31
; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v27
+; GFX9-NEXT: v_or_b32_sdwa v7, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v25
+; GFX9-NEXT: v_or_b32_sdwa v4, v26, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v25
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27
-; GFX9-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v21
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v23
+; GFX9-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v13
; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v9
-; GFX9-NEXT: v_or_b32_sdwa v7, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v17
; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v19
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index 7d07641f455e3f..c3ab9c23d1950b 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -2379,140 +2379,128 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX10-LABEL: return_72xi32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX10-NEXT: s_clause 0x14
-; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
-; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132
-; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:136
-; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:140
-; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144
-; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:148
-; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:152
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:156
-; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160
-; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96
-; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100
-; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104
-; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108
-; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112
-; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116
-; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120
-; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124
-; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
-; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68
-; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72
-; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76
+; GFX10-NEXT: s_clause 0x7
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
+; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72
+; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76
+; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80
+; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
+; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:88
+; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:92
; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:120
-; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:116
-; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84
; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:112
-; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88
; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:108
-; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92
+; GFX10-NEXT: s_clause 0x7
+; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128
+; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132
+; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:136
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140
+; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:144
+; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:148
+; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:152
+; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:156
; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:104
-; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:32
; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:100
-; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:36
; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:96
-; GFX10-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:40
; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:92
-; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44
; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:88
-; GFX10-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48
; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:84
-; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:52
; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:80
-; GFX10-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:56
; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:76
-; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60
; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:72
-; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:28
+; GFX10-NEXT: s_clause 0x7
+; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96
+; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:100
+; GFX10-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104
+; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108
+; GFX10-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112
+; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116
+; GFX10-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120
+; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124
; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:68
-; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:64
-; GFX10-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:16
; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:60
-; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20
; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:56
-; GFX10-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:24
; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:52
-; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4
; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:48
-; GFX10-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8
; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:44
-; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32
; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:40
+; GFX10-NEXT: s_clause 0x7
+; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32
+; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36
+; GFX10-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40
+; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44
+; GFX10-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48
+; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52
+; GFX10-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56
+; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60
; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:36
; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:32
; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:28
; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; GFX10-NEXT: s_clause 0x3
+; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12
+; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16
+; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20
+; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24
; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; GFX10-NEXT: s_clause 0x3
+; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4
+; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8
+; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:160
+; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28
; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32
; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; GFX10-NEXT: s_waitcnt vmcnt(32)
-; GFX10-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen offset:284
-; GFX10-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen offset:280
-; GFX10-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:276
-; GFX10-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:272
-; GFX10-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:268
-; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:264
-; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:260
-; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:256
-; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:252
-; GFX10-NEXT: s_waitcnt vmcnt(24)
-; GFX10-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen offset:248
-; GFX10-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen offset:244
-; GFX10-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen offset:240
-; GFX10-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:236
-; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:232
-; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:228
-; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:224
-; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:220
-; GFX10-NEXT: s_waitcnt vmcnt(16)
-; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:216
-; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:212
-; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:208
-; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:204
-; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:200
-; GFX10-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:196
-; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:192
-; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:188
-; GFX10-NEXT: s_waitcnt vmcnt(8)
-; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:184
-; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:180
-; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:176
-; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:172
-; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:168
-; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:164
-; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:160
-; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:156
-; GFX10-NEXT: s_waitcnt vmcnt(7)
-; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:152
-; GFX10-NEXT: s_waitcnt vmcnt(3)
-; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:148
-; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:144
-; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:140
-; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:136
+; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:284
+; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:280
+; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:276
+; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:272
+; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:268
+; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:264
+; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:260
+; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:256
+; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:252
+; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:248
+; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:244
+; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:240
+; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:236
+; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:232
+; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:228
+; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:224
+; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:220
+; GFX10-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:216
+; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:212
+; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:208
+; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:204
+; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:200
+; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:196
+; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:192
+; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:188
+; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:184
+; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:180
+; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:176
+; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:172
+; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:168
+; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:164
+; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:160
+; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:156
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:132
-; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:128
+; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:152
+; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:148
+; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:144
+; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:140
+; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:136
+; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:132
+; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:128
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:124
+; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:124
; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX10-NEXT: s_clause 0x4
-; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164
-; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168
-; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172
-; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: return_72xi32:
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index fbb54893d9b2ac..a2fca33af10464 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -1253,57 +1253,57 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s4, s2, 16
+; CI-NEXT: v_mov_b32_e32 v5, s3
; CI-NEXT: s_addc_u32 s5, s3, 0
; CI-NEXT: v_mov_b32_e32 v0, s4
-; CI-NEXT: v_mov_b32_e32 v5, s3
-; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: v_mov_b32_e32 v4, s2
+; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; CI-NEXT: s_add_u32 s2, s0, 16
; CI-NEXT: s_addc_u32 s3, s1, 0
+; CI-NEXT: v_mov_b32_e32 v14, s3
+; CI-NEXT: v_mov_b32_e32 v13, s2
+; CI-NEXT: s_add_u32 s2, s0, 48
+; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f32_f16_e32 v14, v3
+; CI-NEXT: v_cvt_f32_f16_e32 v8, v1
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f32_f16_e32 v18, v7
-; CI-NEXT: v_cvt_f32_f16_e32 v16, v6
-; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v7
-; CI-NEXT: v_lshrrev_b32_e32 v25, 16, v6
-; CI-NEXT: v_mov_b32_e32 v7, s3
-; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v5
-; CI-NEXT: v_mov_b32_e32 v6, s2
-; CI-NEXT: s_add_u32 s2, s0, 48
-; CI-NEXT: v_cvt_f32_f16_e32 v10, v1
-; CI-NEXT: v_cvt_f32_f16_e32 v8, v0
+; CI-NEXT: v_cvt_f32_f16_e32 v11, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v9, v6
+; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; CI-NEXT: v_cvt_f32_f16_e32 v12, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v10, v6
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v0
-; CI-NEXT: v_cvt_f32_f16_e32 v12, v2
-; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v2
+; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v5
+; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v4
+; CI-NEXT: flat_store_dwordx4 v[13:14], v[9:12]
+; CI-NEXT: v_cvt_f32_f16_e32 v6, v0
+; CI-NEXT: v_cvt_f32_f16_e32 v12, v3
+; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; CI-NEXT: v_cvt_f32_f16_e32 v10, v2
+; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v2
; CI-NEXT: v_cvt_f32_f16_e32 v2, v5
; CI-NEXT: v_cvt_f32_f16_e32 v0, v4
-; CI-NEXT: v_lshrrev_b32_e32 v24, 16, v4
; CI-NEXT: v_mov_b32_e32 v5, s1
-; CI-NEXT: s_addc_u32 s3, s1, 0
-; CI-NEXT: v_cvt_f32_f16_e32 v15, v3
-; CI-NEXT: v_cvt_f32_f16_e32 v3, v17
-; CI-NEXT: v_cvt_f32_f16_e32 v19, v19
-; CI-NEXT: v_cvt_f32_f16_e32 v17, v25
+; CI-NEXT: v_cvt_f32_f16_e32 v9, v1
+; CI-NEXT: v_cvt_f32_f16_e32 v13, v3
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v16
+; CI-NEXT: v_cvt_f32_f16_e32 v1, v17
; CI-NEXT: v_mov_b32_e32 v4, s0
-; CI-NEXT: v_cvt_f32_f16_e32 v11, v1
; CI-NEXT: s_add_u32 s0, s0, 32
-; CI-NEXT: v_cvt_f32_f16_e32 v1, v24
+; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
; CI-NEXT: s_addc_u32 s1, s1, 0
-; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
-; CI-NEXT: v_mov_b32_e32 v21, s3
-; CI-NEXT: v_mov_b32_e32 v23, s1
-; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; CI-NEXT: v_mov_b32_e32 v20, s2
-; CI-NEXT: v_mov_b32_e32 v22, s0
-; CI-NEXT: flat_store_dwordx4 v[6:7], v[16:19]
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: v_mov_b32_e32 v15, s3
+; CI-NEXT: v_mov_b32_e32 v17, s1
+; CI-NEXT: v_mov_b32_e32 v14, s2
+; CI-NEXT: v_mov_b32_e32 v16, s0
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; CI-NEXT: flat_store_dwordx4 v[20:21], v[12:15]
-; CI-NEXT: flat_store_dwordx4 v[22:23], v[8:11]
+; CI-NEXT: flat_store_dwordx4 v[14:15], v[10:13]
+; CI-NEXT: flat_store_dwordx4 v[16:17], v[6:9]
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_extload_v16f16_to_v16f32:
@@ -1312,26 +1312,24 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: s_add_u32 s2, s2, 16
; VI-NEXT: s_addc_u32 s3, s3, 0
-; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v23, s3
-; VI-NEXT: v_mov_b32_e32 v22, s2
+; VI-NEXT: v_mov_b32_e32 v19, s3
+; VI-NEXT: v_mov_b32_e32 v18, s2
; VI-NEXT: s_add_u32 s2, s0, 48
-; VI-NEXT: v_mov_b32_e32 v21, s1
+; VI-NEXT: v_mov_b32_e32 v17, s1
; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v20, s0
+; VI-NEXT: v_mov_b32_e32 v16, s0
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v25, s3
-; VI-NEXT: v_mov_b32_e32 v27, s1
-; VI-NEXT: v_mov_b32_e32 v24, s2
-; VI-NEXT: v_mov_b32_e32 v26, s0
+; VI-NEXT: v_mov_b32_e32 v21, s3
+; VI-NEXT: v_mov_b32_e32 v20, s2
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_f16_e32 v14, v3
; VI-NEXT: v_cvt_f32_f16_e32 v12, v2
@@ -1341,19 +1339,21 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
; VI-NEXT: v_cvt_f32_f16_e32 v8, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v18, v7
-; VI-NEXT: v_cvt_f32_f16_e32 v16, v6
-; VI-NEXT: v_cvt_f32_f16_sdwa v19, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: v_cvt_f32_f16_sdwa v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_f16_e32 v2, v5
+; VI-NEXT: v_cvt_f32_f16_e32 v14, v7
+; VI-NEXT: v_cvt_f32_f16_e32 v12, v6
+; VI-NEXT: v_cvt_f32_f16_sdwa v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: v_cvt_f32_f16_sdwa v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v0, v4
; VI-NEXT: v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: flat_store_dwordx4 v[22:23], v[12:15]
-; VI-NEXT: flat_store_dwordx4 v[20:21], v[8:11]
-; VI-NEXT: flat_store_dwordx4 v[24:25], v[16:19]
-; VI-NEXT: flat_store_dwordx4 v[26:27], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
+; VI-NEXT: flat_store_dwordx4 v[20:21], v[12:15]
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: global_extload_v16f16_to_v16f32:
@@ -1665,43 +1665,43 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: s_add_u32 s2, s0, 48
; CI-NEXT: s_addc_u32 s3, s1, 0
-; CI-NEXT: v_mov_b32_e32 v19, s3
-; CI-NEXT: v_mov_b32_e32 v18, s2
+; CI-NEXT: v_mov_b32_e32 v7, s3
+; CI-NEXT: v_mov_b32_e32 v6, s2
; CI-NEXT: s_add_u32 s2, s0, 32
-; CI-NEXT: v_mov_b32_e32 v17, s1
+; CI-NEXT: v_mov_b32_e32 v13, s1
; CI-NEXT: s_addc_u32 s3, s1, 0
-; CI-NEXT: v_mov_b32_e32 v16, s0
+; CI-NEXT: v_mov_b32_e32 v12, s0
; CI-NEXT: s_add_u32 s0, s0, 16
+; CI-NEXT: v_mov_b32_e32 v15, s3
; CI-NEXT: s_addc_u32 s1, s1, 0
-; CI-NEXT: v_mov_b32_e32 v21, s3
-; CI-NEXT: v_mov_b32_e32 v23, s1
-; CI-NEXT: v_mov_b32_e32 v20, s2
-; CI-NEXT: v_mov_b32_e32 v22, s0
+; CI-NEXT: v_mov_b32_e32 v14, s2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; CI-NEXT: v_cvt_f32_f16_e32 v10, v4
-; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; CI-NEXT: v_cvt_f32_f16_e32 v11, v5
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v1
-; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_cvt_f32_f16_e32 v24, v1
-; CI-NEXT: v_cvt_f64_f32_e32 v[12:13], v3
-; CI-NEXT: v_cvt_f64_f32_e32 v[14:15], v10
-; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
-; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
-; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v7
-; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v24
-; CI-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
-; CI-NEXT: flat_store_dwordx4 v[20:21], v[8:11]
-; CI-NEXT: flat_store_dwordx4 v[22:23], v[4:7]
+; CI-NEXT: v_cvt_f32_f16_e32 v8, v2
+; CI-NEXT: v_cvt_f32_f16_e32 v2, v4
+; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; CI-NEXT: v_cvt_f32_f16_e32 v10, v1
+; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v0
+; CI-NEXT: v_cvt_f32_f16_e32 v16, v5
+; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
+; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; CI-NEXT: v_cvt_f32_f16_e32 v17, v9
+; CI-NEXT: v_cvt_f32_f16_e32 v18, v11
+; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
+; CI-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
+; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10
+; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v16
+; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17
+; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v18
+; CI-NEXT: v_mov_b32_e32 v17, s1
+; CI-NEXT: v_mov_b32_e32 v16, s0
+; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
; CI-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
+; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_extload_v8f16_to_v8f64:
@@ -1713,39 +1713,39 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: s_add_u32 s2, s0, 48
; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v19, s3
-; VI-NEXT: v_mov_b32_e32 v18, s2
+; VI-NEXT: v_mov_b32_e32 v8, s3
+; VI-NEXT: v_mov_b32_e32 v7, s2
; VI-NEXT: s_add_u32 s2, s0, 32
-; VI-NEXT: v_mov_b32_e32 v17, s1
+; VI-NEXT: v_mov_b32_e32 v13, s1
; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v16, s0
+; VI-NEXT: v_mov_b32_e32 v12, s0
; VI-NEXT: s_add_u32 s0, s0, 16
+; VI-NEXT: v_mov_b32_e32 v15, s3
; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v21, s3
-; VI-NEXT: v_mov_b32_e32 v23, s1
-; VI-NEXT: v_mov_b32_e32 v20, s2
-; VI-NEXT: v_mov_b32_e32 v22, s0
+; VI-NEXT: v_mov_b32_e32 v14, s2
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v10, v3
-; VI-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: v_cvt_f32_f16_e32 v7, v2
+; VI-NEXT: v_cvt_f32_f16_e32 v9, v0
+; VI-NEXT: v_cvt_f32_f16_sdwa v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: v_cvt_f32_f16_e32 v0, v3
+; VI-NEXT: v_cvt_f32_f16_sdwa v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: v_cvt_f32_f16_e32 v10, v1
+; VI-NEXT: v_cvt_f32_f16_e32 v11, v2
+; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v0
+; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: v_cvt_f32_f16_e32 v4, v0
-; VI-NEXT: v_cvt_f32_f16_e32 v5, v1
-; VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: v_cvt_f32_f16_sdwa v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v10
-; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v3
-; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v7
+; VI-NEXT: v_cvt_f32_f16_sdwa v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9
+; VI-NEXT: flat_store_dwordx4 v[7:8], v[3:6]
+; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v11
+; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v2
-; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
-; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
-; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v24
-; VI-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
-; VI-NEXT: flat_store_dwordx4 v[20:21], v[8:11]
-; VI-NEXT: flat_store_dwordx4 v[22:23], v[4:7]
-; VI-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
+; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v17
+; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v16
+; VI-NEXT: v_mov_b32_e32 v17, s1
+; VI-NEXT: v_mov_b32_e32 v16, s0
+; VI-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
+; VI-NEXT: flat_store_dwordx4 v[16:17], v[4:7]
+; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: global_extload_v8f16_to_v8f64:
@@ -1794,92 +1794,91 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
+; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: s_add_u32 s2, s2, 16
; CI-NEXT: s_addc_u32 s3, s3, 0
-; CI-NEXT: v_mov_b32_e32 v0, s2
-; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; CI-NEXT: v_mov_b32_e32 v5, s3
+; CI-NEXT: v_mov_b32_e32 v4, s2
+; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; CI-NEXT: s_add_u32 s2, s0, 48
; CI-NEXT: s_addc_u32 s3, s1, 0
-; CI-NEXT: v_mov_b32_e32 v14, s3
-; CI-NEXT: v_mov_b32_e32 v13, s2
+; CI-NEXT: v_mov_b32_e32 v15, s3
+; CI-NEXT: v_mov_b32_e32 v14, s2
; CI-NEXT: s_add_u32 s2, s0, 32
; CI-NEXT: s_addc_u32 s3, s1, 0
-; CI-NEXT: v_mov_b32_e32 v16, s3
-; CI-NEXT: v_mov_b32_e32 v15, s2
+; CI-NEXT: v_mov_b32_e32 v17, s3
+; CI-NEXT: v_mov_b32_e32 v16, s2
; CI-NEXT: s_add_u32 s2, s0, 16
; CI-NEXT: s_addc_u32 s3, s1, 0
-; CI-NEXT: v_mov_b32_e32 v18, s3
-; CI-NEXT: v_mov_b32_e32 v17, s2
+; CI-NEXT: v_mov_b32_e32 v19, s3
+; CI-NEXT: v_mov_b32_e32 v18, s2
; CI-NEXT: s_add_u32 s2, s0, 0x70
; CI-NEXT: s_addc_u32 s3, s1, 0
-; CI-NEXT: v_mov_b32_e32 v12, s1
-; CI-NEXT: v_mov_b32_e32 v11, s0
+; CI-NEXT: v_mov_b32_e32 v13, s1
+; CI-NEXT: v_mov_b32_e32 v12, s0
; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v7
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT: v_cvt_f32_f16_e32 v9, v8
-; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v6
-; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT: v_cvt_f64_f32_e32 v[7:8], v7
-; CI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
+; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cvt_f32_f16_e32 v10, v8
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f32_f16_e32 v21, v0
-; CI-NEXT: flat_store_dwordx4 v[13:14], v[7:10]
-; CI-NEXT: s_nop 0
-; CI-NEXT: v_cvt_f32_f16_e32 v8, v19
-; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v3
-; CI-NEXT: v_mov_b32_e32 v14, s3
-; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
-; CI-NEXT: v_mov_b32_e32 v13, s2
-; CI-NEXT: s_add_u32 s2, s0, 0x60
-; CI-NEXT: s_addc_u32 s3, s1, 0
-; CI-NEXT: flat_store_dwordx4 v[15:16], v[6:9]
-; CI-NEXT: v_mov_b32_e32 v16, s3
-; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4
-; CI-NEXT: v_cvt_f32_f16_e32 v9, v4
-; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
-; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
+; CI-NEXT: v_lshrrev_b32_e32 v20, 16, v5
+; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3
+; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
+; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cvt_f32_f16_e32 v21, v5
+; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
+; CI-NEXT: v_mov_b32_e32 v15, s3
+; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
+; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3
+; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
+; CI-NEXT: v_mov_b32_e32 v14, s2
+; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; CI-NEXT: v_cvt_f32_f16_e32 v9, v0
+; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
+; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; CI-NEXT: v_mov_b32_e32 v15, s2
-; CI-NEXT: s_add_u32 s2, s0, 0x50
-; CI-NEXT: flat_store_dwordx4 v[17:18], v[4:7]
-; CI-NEXT: v_cvt_f32_f16_e32 v17, v1
-; CI-NEXT: v_cvt_f32_f16_e32 v4, v3
-; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; CI-NEXT: v_cvt_f32_f16_e32 v6, v2
-; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v0
+; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
+; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v6
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8
; CI-NEXT: v_cvt_f32_f16_e32 v8, v10
-; CI-NEXT: v_cvt_f32_f16_e32 v10, v5
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT: flat_store_dwordx4 v[11:12], v[0:3]
-; CI-NEXT: v_cvt_f32_f16_e32 v12, v18
-; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
+; CI-NEXT: s_add_u32 s2, s0, 0x60
+; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT: v_cvt_f32_f16_e32 v10, v11
+; CI-NEXT: s_addc_u32 s3, s1, 0
+; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; CI-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
+; CI-NEXT: v_mov_b32_e32 v17, s3
+; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v20
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; CI-NEXT: v_cvt_f32_f16_e32 v12, v5
+; CI-NEXT: v_mov_b32_e32 v16, s2
+; CI-NEXT: s_add_u32 s2, s0, 0x50
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; CI-NEXT: s_add_u32 s0, s0, 64
-; CI-NEXT: flat_store_dwordx4 v[13:14], v[0:3]
+; CI-NEXT: flat_store_dwordx4 v[14:15], v[0:3]
; CI-NEXT: s_addc_u32 s1, s1, 0
-; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v17
+; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v21
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7
-; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v21
+; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12
-; CI-NEXT: v_mov_b32_e32 v20, s3
+; CI-NEXT: v_mov_b32_e32 v19, s3
; CI-NEXT: v_mov_b32_e32 v13, s1
-; CI-NEXT: v_mov_b32_e32 v19, s2
+; CI-NEXT: v_mov_b32_e32 v18, s2
; CI-NEXT: v_mov_b32_e32 v12, s0
-; CI-NEXT: flat_store_dwordx4 v[15:16], v[8:11]
-; CI-NEXT: flat_store_dwordx4 v[19:20], v[0:3]
+; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
+; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; CI-NEXT: s_endpgm
;
@@ -1897,76 +1896,77 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: s_add_u32 s2, s0, 48
; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v9, s3
-; VI-NEXT: v_mov_b32_e32 v8, s2
+; VI-NEXT: v_mov_b32_e32 v14, s3
+; VI-NEXT: v_mov_b32_e32 v13, s2
; VI-NEXT: s_add_u32 s2, s0, 32
; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v13, s3
-; VI-NEXT: v_mov_b32_e32 v12, s2
+; VI-NEXT: v_mov_b32_e32 v16, s3
+; VI-NEXT: v_mov_b32_e32 v15, s2
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v15, s3
-; VI-NEXT: v_mov_b32_e32 v14, s2
+; VI-NEXT: v_mov_b32_e32 v18, s3
+; VI-NEXT: v_mov_b32_e32 v17, s2
; VI-NEXT: s_add_u32 s2, s0, 0x50
+; VI-NEXT: v_mov_b32_e32 v12, s1
; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v17, s3
-; VI-NEXT: v_mov_b32_e32 v16, s2
+; VI-NEXT: v_mov_b32_e32 v11, s0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_cvt_f32_f16_e32 v8, v7
+; VI-NEXT: v_cvt_f32_f16_sdwa v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8
+; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
+; VI-NEXT: flat_store_dwordx4 v[13:14], v[7:10]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_cvt_f32_f16_e32 v7, v6
+; VI-NEXT: v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_cvt_f32_f16_e32 v10, v2
+; VI-NEXT: v_mov_b32_e32 v14, s3
+; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
+; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
+; VI-NEXT: v_mov_b32_e32 v13, s2
; VI-NEXT: s_add_u32 s2, s0, 64
; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v19, s3
-; VI-NEXT: v_mov_b32_e32 v11, s1
-; VI-NEXT: v_mov_b32_e32 v18, s2
+; VI-NEXT: flat_store_dwordx4 v[15:16], v[6:9]
+; VI-NEXT: v_mov_b32_e32 v16, s3
+; VI-NEXT: v_cvt_f32_f16_e32 v6, v5
+; VI-NEXT: v_cvt_f32_f16_sdwa v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: v_cvt_f32_f16_e32 v8, v4
+; VI-NEXT: v_cvt_f32_f16_sdwa v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
+; VI-NEXT: v_mov_b32_e32 v15, s2
; VI-NEXT: s_add_u32 s2, s0, 0x70
-; VI-NEXT: v_mov_b32_e32 v10, s0
; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: flat_store_dwordx4 v[17:18], v[4:7]
+; VI-NEXT: v_cvt_f32_f16_sdwa v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v8
+; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v9
+; VI-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: v_cvt_f32_f16_e32 v2, v1
+; VI-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
+; VI-NEXT: v_cvt_f32_f16_sdwa v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: v_cvt_f32_f16_e32 v7, v3
+; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v9
+; VI-NEXT: v_cvt_f32_f16_e32 v9, v0
+; VI-NEXT: v_cvt_f64_f32_e32 v[1:2], v2
+; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v10
+; VI-NEXT: v_cvt_f64_f32_e32 v[11:12], v11
+; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
; VI-NEXT: s_add_u32 s0, s0, 0x60
+; VI-NEXT: flat_store_dwordx4 v[13:14], v[1:4]
; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_cvt_f32_f16_e32 v22, v4
-; VI-NEXT: v_cvt_f32_f16_sdwa v23, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: v_cvt_f32_f16_e32 v4, v7
-; VI-NEXT: v_cvt_f32_f16_sdwa v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: v_cvt_f32_f16_e32 v24, v5
-; VI-NEXT: v_cvt_f32_f16_sdwa v25, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: v_cvt_f32_f16_e32 v20, v6
-; VI-NEXT: v_cvt_f32_f16_sdwa v21, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
-; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v26, v2
-; VI-NEXT: v_cvt_f32_f16_sdwa v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: v_cvt_f32_f16_sdwa v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; VI-NEXT: v_cvt_f32_f16_e32 v8, v3
-; VI-NEXT: v_cvt_f32_f16_e32 v29, v0
-; VI-NEXT: v_cvt_f32_f16_sdwa v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: v_cvt_f32_f16_e32 v31, v1
-; VI-NEXT: v_cvt_f32_f16_sdwa v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v20
-; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v21
-; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v22
-; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v23
-; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
-; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v31
-; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v24
-; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v25
-; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
-; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v28
-; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v29
-; VI-NEXT: flat_store_dwordx4 v[14:15], v[0:3]
-; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v32
-; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v30
-; VI-NEXT: v_mov_b32_e32 v21, s3
-; VI-NEXT: v_mov_b32_e32 v23, s1
-; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v26
-; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v27
-; VI-NEXT: v_mov_b32_e32 v20, s2
-; VI-NEXT: v_mov_b32_e32 v22, s0
-; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
-; VI-NEXT: flat_store_dwordx4 v[18:19], v[4:7]
-; VI-NEXT: flat_store_dwordx4 v[20:21], v[8:11]
-; VI-NEXT: flat_store_dwordx4 v[22:23], v[0:3]
+; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7
+; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17
+; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8
+; VI-NEXT: v_mov_b32_e32 v20, s3
+; VI-NEXT: v_mov_b32_e32 v14, s1
+; VI-NEXT: v_mov_b32_e32 v19, s2
+; VI-NEXT: v_mov_b32_e32 v13, s0
+; VI-NEXT: flat_store_dwordx4 v[15:16], v[9:12]
+; VI-NEXT: flat_store_dwordx4 v[19:20], v[0:3]
+; VI-NEXT: flat_store_dwordx4 v[13:14], v[5:8]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: global_extload_v16f16_to_v16f64:
@@ -2368,52 +2368,51 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v13, s3
-; CI-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; CI-NEXT: v_mov_b32_e32 v12, s2
+; CI-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; CI-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; CI-NEXT: s_add_u32 s2, s0, 16
; CI-NEXT: s_addc_u32 s3, s1, 0
-; CI-NEXT: v_mov_b32_e32 v17, s3
-; CI-NEXT: v_mov_b32_e32 v16, s2
; CI-NEXT: s_waitcnt vmcnt(3)
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: s_waitcnt vmcnt(2)
; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v16, v5
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v4
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
-; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_mov_b32_e32 v5, s3
; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v1
; CI-NEXT: v_or_b32_e32 v1, v2, v3
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v16
+; CI-NEXT: v_mov_b32_e32 v4, s2
; CI-NEXT: v_or_b32_e32 v0, v0, v18
; CI-NEXT: v_or_b32_e32 v3, v6, v2
-; CI-NEXT: v_or_b32_e32 v2, v4, v5
-; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v11
-; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v9
-; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v15
-; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v13
-; CI-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
-; CI-NEXT: s_nop 0
-; CI-NEXT: v_or_b32_e32 v1, v10, v4
-; CI-NEXT: v_or_b32_e32 v0, v8, v5
+; CI-NEXT: v_or_b32_e32 v2, v17, v7
+; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11
+; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v9
+; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v15
+; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v13
+; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: v_mov_b32_e32 v5, s1
-; CI-NEXT: v_or_b32_e32 v3, v14, v6
-; CI-NEXT: v_or_b32_e32 v2, v12, v7
+; CI-NEXT: v_or_b32_e32 v1, v10, v6
+; CI-NEXT: v_or_b32_e32 v0, v8, v7
+; CI-NEXT: v_or_b32_e32 v3, v14, v9
+; CI-NEXT: v_or_b32_e32 v2, v12, v11
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_endpgm
@@ -2429,31 +2428,29 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
; VI-NEXT: s_add_u32 s4, s2, 48
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v9, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v8, s2
; VI-NEXT: s_add_u32 s2, s2, 16
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v13, s3
+; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
+; VI-NEXT: v_mov_b32_e32 v13, s3
; VI-NEXT: v_mov_b32_e32 v12, s2
; VI-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; VI-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v17, s3
-; VI-NEXT: v_mov_b32_e32 v16, s2
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; VI-NEXT: v_cvt_f16_f32_sdwa v18, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-NEXT: v_cvt_f16_f32_sdwa v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; VI-NEXT: v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; VI-NEXT: v_cvt_f16_f32_sdwa v17, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-NEXT: v_cvt_f16_f32_e32 v18, v4
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f16_f32_sdwa v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v10, v10
@@ -2464,17 +2461,19 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
; VI-NEXT: v_cvt_f16_f32_e32 v14, v14
; VI-NEXT: v_cvt_f16_f32_sdwa v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v12, v12
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_or_b32_e32 v1, v2, v3
-; VI-NEXT: v_or_b32_e32 v0, v0, v18
+; VI-NEXT: v_or_b32_e32 v0, v0, v16
; VI-NEXT: v_or_b32_e32 v3, v6, v7
-; VI-NEXT: v_or_b32_e32 v2, v4, v5
+; VI-NEXT: v_or_b32_e32 v2, v18, v17
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_or_b32_e32 v1, v10, v11
; VI-NEXT: v_or_b32_e32 v0, v8, v9
; VI-NEXT: v_or_b32_e32 v3, v14, v15
; VI-NEXT: v_or_b32_e32 v2, v12, v13
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index add62a5c39cb14..b9d3763e7def10 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -2678,7 +2678,8 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v14
; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
-; GFX8-NEXT: v_mul_lo_u16_e32 v20, v16, v18
+; GFX8-NEXT: v_mul_lo_u16_e32 v10, v10, v15
+; GFX8-NEXT: v_mul_lo_u16_e32 v15, v16, v18
; GFX8-NEXT: v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
@@ -2686,8 +2687,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mul_lo_u16_e32 v14, v17, v19
; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_mul_lo_u16_e32 v8, v9, v11
-; GFX8-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_mul_lo_u16_e32 v10, v10, v15
+; GFX8-NEXT: v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v6, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index e71c6cf71c8823..74020c43a3ca3f 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -1111,16 +1111,13 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out,
; GENERIC-LABEL: extract_neg_offset_sgpr_loaded:
; GENERIC: ; %bb.0: ; %entry
; GENERIC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19
-; GENERIC-NEXT: s_load_dword s2, s[4:5], 0x39
; GENERIC-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x29
-; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_load_dword s2, s[4:5], 0x39
; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT: s_addk_i32 s2, 0xfe00
-; GENERIC-NEXT: s_or_b32 s4, s23, s51
-; GENERIC-NEXT: s_or_b32 s5, s22, s50
-; GENERIC-NEXT: s_or_b32 s6, s21, s49
-; GENERIC-NEXT: s_or_b32 s7, s20, s48
+; GENERIC-NEXT: s_or_b32 s6, s23, s51
+; GENERIC-NEXT: s_or_b32 s7, s22, s50
+; GENERIC-NEXT: s_or_b32 s21, s21, s49
+; GENERIC-NEXT: s_or_b32 s20, s20, s48
; GENERIC-NEXT: s_or_b32 s19, s19, s47
; GENERIC-NEXT: s_or_b32 s18, s18, s46
; GENERIC-NEXT: s_or_b32 s17, s17, s45
@@ -1133,38 +1130,42 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out,
; GENERIC-NEXT: s_or_b32 s10, s10, s38
; GENERIC-NEXT: s_or_b32 s8, s8, s36
; GENERIC-NEXT: s_or_b32 s9, s9, s37
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_addk_i32 s2, 0xfe00
; GENERIC-NEXT: s_cmp_eq_u32 s2, 1
-; GENERIC-NEXT: s_cselect_b32 s8, s9, s8
+; GENERIC-NEXT: s_cselect_b32 s4, s9, s8
; GENERIC-NEXT: s_cmp_eq_u32 s2, 2
-; GENERIC-NEXT: s_cselect_b32 s8, s10, s8
+; GENERIC-NEXT: s_cselect_b32 s4, s10, s4
; GENERIC-NEXT: s_cmp_eq_u32 s2, 3
-; GENERIC-NEXT: s_cselect_b32 s8, s11, s8
+; GENERIC-NEXT: s_cselect_b32 s4, s11, s4
; GENERIC-NEXT: s_cmp_eq_u32 s2, 4
-; GENERIC-NEXT: s_cselect_b32 s8, s12, s8
+; GENERIC-NEXT: s_cselect_b32 s4, s12, s4
; GENERIC-NEXT: s_cmp_eq_u32 s2, 5
-; GENERIC-NEXT: s_cselect_b32 s8, s13, s8
+; GENERIC-NEXT: s_cselect_b32 s4, s13, s4
; GENERIC-NEXT: s_cmp_eq_u32 s2, 6
-; GENERIC-NEXT: s_cselect_b32 s8, s14, s8
+; GENERIC-NEXT: s_cselect_b32 s4, s14, s4
; GENERIC-NEXT: s_cmp_eq_u32 s2, 7
-; GENERIC-NEXT: s_cselect_b32 s8, s15, s8
+; GENERIC-NEXT: s_cselect_b32 s4, s15, s4
; GENERIC-NEXT: s_cmp_eq_u32 s2, 8
-; GENERIC-NEXT: s_cselect_b32 s8, s16, s8
+; GENERIC-NEXT: s_cselect_b32 s4, s16, s4
; GENERIC-NEXT: s_cmp_eq_u32 s2, 9
-; GENERIC-NEXT: s_cselect_b32 s8, s17, s8
+; GENERIC-NEXT: s_cselect_b32 s4, s17, s4
; GENERIC-NEXT: s_cmp_eq_u32 s2, 10
-; GENERIC-NEXT: s_cselect_b32 s8, s18, s8
+; GENERIC-NEXT: s_cselect_b32 s4, s18, s4
; GENERIC-NEXT: s_cmp_eq_u32 s2, 11
-; GENERIC-NEXT: s_cselect_b32 s8, s19, s8
+; GENERIC-NEXT: s_cselect_b32 s4, s19, s4
; GENERIC-NEXT: s_cmp_eq_u32 s2, 12
-; GENERIC-NEXT: s_cselect_b32 s7, s7, s8
+; GENERIC-NEXT: s_cselect_b32 s4, s20, s4
; GENERIC-NEXT: s_cmp_eq_u32 s2, 13
-; GENERIC-NEXT: s_cselect_b32 s6, s6, s7
+; GENERIC-NEXT: s_cselect_b32 s4, s21, s4
; GENERIC-NEXT: s_cmp_eq_u32 s2, 14
-; GENERIC-NEXT: s_cselect_b32 s5, s5, s6
+; GENERIC-NEXT: s_cselect_b32 s4, s7, s4
; GENERIC-NEXT: s_cmp_eq_u32 s2, 15
-; GENERIC-NEXT: s_cselect_b32 s4, s4, s5
+; GENERIC-NEXT: s_cselect_b32 s4, s6, s4
; GENERIC-NEXT: s_mov_b32 s2, -1
; GENERIC-NEXT: v_mov_b32_e32 v0, s4
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GENERIC-NEXT: s_endpgm
;
@@ -1278,9 +1279,9 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out,
; SI-MOVREL-NEXT: s_mov_b32 s2, -1
; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT: s_or_b32 s8, s8, s36
-; SI-MOVREL-NEXT: s_or_b32 s5, s23, s51
-; SI-MOVREL-NEXT: s_or_b32 s6, s22, s50
-; SI-MOVREL-NEXT: s_or_b32 s7, s21, s49
+; SI-MOVREL-NEXT: s_or_b32 s6, s23, s51
+; SI-MOVREL-NEXT: s_or_b32 s7, s22, s50
+; SI-MOVREL-NEXT: s_or_b32 s21, s21, s49
; SI-MOVREL-NEXT: s_or_b32 s20, s20, s48
; SI-MOVREL-NEXT: s_or_b32 s19, s19, s47
; SI-MOVREL-NEXT: s_or_b32 s18, s18, s46
@@ -1307,9 +1308,9 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out,
; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s18
; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s19
; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s20
-; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s7
-; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s6
-; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s21
+; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s7
+; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s6
; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0
; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-MOVREL-NEXT: s_endpgm
@@ -5699,94 +5700,94 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
; GENERIC-NEXT: v_mov_b32_e32 v2, 0
; GENERIC-NEXT: s_mov_b32 s27, s3
; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT: buffer_load_dword v2, v[1:2], s[24:27], 0 addr64 glc
+; GENERIC-NEXT: buffer_load_dword v14, v[1:2], s[24:27], 0 addr64 glc
; GENERIC-NEXT: s_waitcnt vmcnt(0)
-; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GENERIC-NEXT: s_mov_b32 s2, -1
; GENERIC-NEXT: ;;#ASMSTART
; GENERIC-NEXT: v_mov_b32 v1, 62
; GENERIC-NEXT: ;;#ASMEND
-; GENERIC-NEXT: v_mov_b32_e32 v3, s20
-; GENERIC-NEXT: v_mov_b32_e32 v4, s21
-; GENERIC-NEXT: v_mov_b32_e32 v5, s22
-; GENERIC-NEXT: v_mov_b32_e32 v6, s23
-; GENERIC-NEXT: v_mov_b32_e32 v7, s16
-; GENERIC-NEXT: v_mov_b32_e32 v8, s17
-; GENERIC-NEXT: v_mov_b32_e32 v9, s18
-; GENERIC-NEXT: v_mov_b32_e32 v10, s19
-; GENERIC-NEXT: v_mov_b32_e32 v11, s12
-; GENERIC-NEXT: v_mov_b32_e32 v12, s13
-; GENERIC-NEXT: v_mov_b32_e32 v13, s14
-; GENERIC-NEXT: v_mov_b32_e32 v14, s15
-; GENERIC-NEXT: v_mov_b32_e32 v15, s8
-; GENERIC-NEXT: v_mov_b32_e32 v16, s9
-; GENERIC-NEXT: v_mov_b32_e32 v17, s10
-; GENERIC-NEXT: v_mov_b32_e32 v18, s11
-; GENERIC-NEXT: v_add_i32_e32 v19, vcc, 1, v2
-; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2
-; GENERIC-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc
-; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 13, v2
-; GENERIC-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc
-; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 14, v2
-; GENERIC-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc
-; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2
-; GENERIC-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc
-; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2
-; GENERIC-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc
-; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2
-; GENERIC-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc
-; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2
-; GENERIC-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc
-; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2
-; GENERIC-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc
-; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2
-; GENERIC-NEXT: v_cndmask_b32_e32 v6, v11, v1, vcc
-; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2
-; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc
-; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2
-; GENERIC-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc
-; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2
-; GENERIC-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc
-; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GENERIC-NEXT: v_cndmask_b32_e32 v11, v15, v1, vcc
-; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GENERIC-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc
-; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2
-; GENERIC-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc
-; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2
-; GENERIC-NEXT: v_cndmask_b32_e32 v2, v18, v1, vcc
-; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 3, v19
-; GENERIC-NEXT: v_cndmask_b32_e32 v5, 63, v2, vcc
-; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 2, v19
+; GENERIC-NEXT: v_mov_b32_e32 v10, s22
+; GENERIC-NEXT: v_mov_b32_e32 v11, s23
+; GENERIC-NEXT: v_mov_b32_e32 v15, s16
+; GENERIC-NEXT: v_mov_b32_e32 v2, s18
+; GENERIC-NEXT: v_mov_b32_e32 v3, s19
+; GENERIC-NEXT: v_mov_b32_e32 v4, s12
+; GENERIC-NEXT: v_mov_b32_e32 v5, s13
+; GENERIC-NEXT: v_mov_b32_e32 v6, s14
+; GENERIC-NEXT: v_mov_b32_e32 v7, s15
+; GENERIC-NEXT: v_mov_b32_e32 v8, s8
+; GENERIC-NEXT: v_mov_b32_e32 v9, s9
+; GENERIC-NEXT: v_mov_b32_e32 v12, s10
+; GENERIC-NEXT: v_mov_b32_e32 v13, s11
+; GENERIC-NEXT: v_add_i32_e32 v18, vcc, 1, v14
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 10, v14
+; GENERIC-NEXT: v_cndmask_b32_e32 v16, v2, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 11, v14
+; GENERIC-NEXT: v_cndmask_b32_e32 v17, v3, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 4, v14
+; GENERIC-NEXT: v_cndmask_b32_e32 v19, v4, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 5, v14
+; GENERIC-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 6, v14
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14
+; GENERIC-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 2, v14
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, v12, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14
+; GENERIC-NEXT: v_cndmask_b32_e32 v5, v13, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 3, v18
+; GENERIC-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 2, v18
; GENERIC-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc
-; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 1, v19
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 1, v18
; GENERIC-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc
-; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; GENERIC-NEXT: v_cndmask_b32_e32 v2, 63, v11, vcc
-; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 7, v19
-; GENERIC-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc
-; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 6, v19
-; GENERIC-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc
-; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 5, v19
-; GENERIC-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc
-; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 4, v19
-; GENERIC-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc
-; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 11, v19
-; GENERIC-NEXT: v_cndmask_b32_e32 v13, 63, v10, vcc
-; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 10, v19
-; GENERIC-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc
-; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 9, v19
-; GENERIC-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc
-; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 8, v19
-; GENERIC-NEXT: v_cndmask_b32_e32 v10, 63, v24, vcc
-; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 15, v19
-; GENERIC-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc
-; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 14, v19
-; GENERIC-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc
-; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 13, v19
-; GENERIC-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc
-; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19
-; GENERIC-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
+; GENERIC-NEXT: v_cndmask_b32_e32 v2, 63, v2, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 7, v18
+; GENERIC-NEXT: v_cndmask_b32_e32 v9, 63, v7, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 6, v18
+; GENERIC-NEXT: v_cndmask_b32_e32 v8, 63, v6, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 5, v18
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, 63, v20, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 4, v18
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, 63, v19, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 11, v18
+; GENERIC-NEXT: v_cndmask_b32_e32 v13, 63, v17, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 10, v18
+; GENERIC-NEXT: v_cndmask_b32_e32 v12, 63, v16, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v16, s17
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 14, v14
+; GENERIC-NEXT: v_cndmask_b32_e32 v19, v10, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 15, v14
+; GENERIC-NEXT: v_cndmask_b32_e32 v17, v11, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 8, v14
+; GENERIC-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 9, v14
+; GENERIC-NEXT: v_cndmask_b32_e32 v11, v16, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 9, v18
+; GENERIC-NEXT: v_cndmask_b32_e32 v11, 63, v11, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 8, v18
+; GENERIC-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 14, v18
+; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v18
+; GENERIC-NEXT: v_cndmask_b32_e64 v17, 63, v17, s[0:1]
+; GENERIC-NEXT: v_cndmask_b32_e32 v16, 63, v19, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v15, s21
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 13, v14
+; GENERIC-NEXT: v_cndmask_b32_e32 v15, v15, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 13, v18
+; GENERIC-NEXT: v_cndmask_b32_e32 v15, 63, v15, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v19, s20
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 12, v14
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_cndmask_b32_e32 v14, v19, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, v18
+; GENERIC-NEXT: v_cndmask_b32_e32 v14, 63, v14, vcc
; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
; GENERIC-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48
; GENERIC-NEXT: s_waitcnt vmcnt(0)
@@ -6257,97 +6258,98 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
; SI-MOVREL-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0
; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-MOVREL-NEXT: buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc
+; SI-MOVREL-NEXT: buffer_load_dword v14, v[1:2], s[8:11], 0 addr64 glc
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19
-; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-MOVREL-NEXT: ;;#ASMSTART
; SI-MOVREL-NEXT: v_mov_b32 v1, 62
; SI-MOVREL-NEXT: ;;#ASMEND
; SI-MOVREL-NEXT: s_mov_b32 s2, -1
; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s20
-; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s21
-; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s22
-; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s23
-; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s16
-; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s17
-; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s18
-; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s19
-; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s12
-; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s13
-; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s14
-; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s15
-; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s8
-; SI-MOVREL-NEXT: v_mov_b32_e32 v16, s9
-; SI-MOVREL-NEXT: v_mov_b32_e32 v17, s10
-; SI-MOVREL-NEXT: v_mov_b32_e32 v18, s11
-; SI-MOVREL-NEXT: v_add_i32_e32 v19, vcc, 1, v2
-; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc
-; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 13, v2
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc
-; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 14, v2
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc
-; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc
-; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc
-; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc
-; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc
-; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc
-; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, v11, v1, vcc
-; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc
-; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc
-; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc
-; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, v15, v1, vcc
-; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc
-; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc
-; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, v18, v1, vcc
-; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 3, v19
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v5, 63, v2, vcc
-; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 2, v19
+; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s18
+; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s19
+; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s12
+; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s13
+; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s14
+; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s15
+; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s8
+; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s9
+; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s10
+; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s11
+; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s22
+; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s23
+; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s16
+; SI-MOVREL-NEXT: v_add_i32_e32 v18, vcc, 1, v14
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 10, v14
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v16, v2, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 11, v14
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v17, v3, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 4, v14
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v19, v4, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 5, v14
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 6, v14
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 2, v14
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, v12, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v5, v13, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 3, v18
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 2, v18
; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc
-; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v19
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v18
; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc
-; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, 63, v11, vcc
-; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 7, v19
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc
-; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 6, v19
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc
-; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 5, v19
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc
-; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 4, v19
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc
-; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 11, v19
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v13, 63, v10, vcc
-; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 10, v19
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc
-; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 9, v19
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc
-; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 8, v19
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, 63, v24, vcc
-; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 15, v19
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc
-; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 14, v19
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc
-; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 13, v19
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc
-; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19
-; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, 63, v2, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 7, v18
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, 63, v7, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 6, v18
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v8, 63, v6, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 5, v18
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, 63, v20, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 4, v18
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, 63, v19, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 11, v18
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v13, 63, v17, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 10, v18
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v12, 63, v16, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 14, v14
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v19, v10, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v14
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v17, v11, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 8, v14
+; SI-MOVREL-NEXT: v_mov_b32_e32 v16, s17
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 9, v14
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, v16, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 9, v18
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, 63, v11, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 8, v18
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 14, v18
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v18
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v17, 63, v17, s[0:1]
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v16, 63, v19, vcc
+; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s21
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 13, v14
+; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v15, v15, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 13, v18
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v15, 63, v15, vcc
+; SI-MOVREL-NEXT: v_mov_b32_e32 v19, s20
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 12, v14
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, v19, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 12, v18
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, 63, v14, vcc
; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32
@@ -6368,104 +6370,104 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; VI-NEXT: flat_load_dword v2, v[1:2] glc
+; VI-NEXT: flat_load_dword v14, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, s18
+; VI-NEXT: v_mov_b32_e32 v3, s19
; VI-NEXT: ;;#ASMSTART
; VI-NEXT: v_mov_b32 v1, 62
; VI-NEXT: ;;#ASMEND
+; VI-NEXT: v_mov_b32_e32 v4, s12
+; VI-NEXT: v_mov_b32_e32 v5, s13
+; VI-NEXT: v_mov_b32_e32 v6, s14
+; VI-NEXT: v_mov_b32_e32 v7, s15
+; VI-NEXT: v_mov_b32_e32 v8, s8
+; VI-NEXT: v_mov_b32_e32 v9, s9
+; VI-NEXT: v_mov_b32_e32 v12, s10
+; VI-NEXT: v_mov_b32_e32 v13, s11
+; VI-NEXT: v_mov_b32_e32 v10, s22
+; VI-NEXT: v_mov_b32_e32 v11, s23
+; VI-NEXT: v_mov_b32_e32 v15, s16
+; VI-NEXT: v_add_u32_e32 v18, vcc, 1, v14
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 10, v14
+; VI-NEXT: v_cndmask_b32_e32 v16, v2, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, v14
+; VI-NEXT: v_cndmask_b32_e32 v17, v3, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v14
+; VI-NEXT: v_cndmask_b32_e32 v19, v4, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v14
+; VI-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v14
+; VI-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14
+; VI-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14
+; VI-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14
+; VI-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 2, v14
+; VI-NEXT: v_cndmask_b32_e32 v4, v12, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14
+; VI-NEXT: v_cndmask_b32_e32 v5, v13, v1, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 3, v18
+; VI-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 2, v18
+; VI-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v18
+; VI-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
+; VI-NEXT: v_cndmask_b32_e32 v2, 63, v2, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 7, v18
+; VI-NEXT: v_cndmask_b32_e32 v9, 63, v7, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 6, v18
+; VI-NEXT: v_cndmask_b32_e32 v8, 63, v6, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v18
+; VI-NEXT: v_cndmask_b32_e32 v7, 63, v20, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 4, v18
+; VI-NEXT: v_cndmask_b32_e32 v6, 63, v19, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 11, v18
+; VI-NEXT: v_cndmask_b32_e32 v13, 63, v17, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 10, v18
+; VI-NEXT: v_cndmask_b32_e32 v12, 63, v16, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 14, v14
+; VI-NEXT: v_cndmask_b32_e32 v19, v10, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 15, v14
+; VI-NEXT: v_cndmask_b32_e32 v17, v11, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 8, v14
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v18
+; VI-NEXT: v_mov_b32_e32 v16, s17
+; VI-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 9, v14
+; VI-NEXT: v_cndmask_b32_e64 v17, 63, v17, s[0:1]
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_cndmask_b32_e32 v11, v16, v1, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 9, v18
+; VI-NEXT: v_cndmask_b32_e32 v11, 63, v11, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 8, v18
+; VI-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 14, v18
+; VI-NEXT: v_cndmask_b32_e32 v16, 63, v19, vcc
+; VI-NEXT: v_mov_b32_e32 v15, s21
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 13, v14
+; VI-NEXT: v_cndmask_b32_e32 v15, v15, v1, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 13, v18
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s20
-; VI-NEXT: v_mov_b32_e32 v4, s21
-; VI-NEXT: v_mov_b32_e32 v5, s22
-; VI-NEXT: v_mov_b32_e32 v6, s23
-; VI-NEXT: v_mov_b32_e32 v7, s16
-; VI-NEXT: v_mov_b32_e32 v8, s17
-; VI-NEXT: v_mov_b32_e32 v9, s18
-; VI-NEXT: v_mov_b32_e32 v10, s19
-; VI-NEXT: v_mov_b32_e32 v11, s12
-; VI-NEXT: v_mov_b32_e32 v12, s13
-; VI-NEXT: v_mov_b32_e32 v13, s14
-; VI-NEXT: v_mov_b32_e32 v14, s15
-; VI-NEXT: v_mov_b32_e32 v15, s8
-; VI-NEXT: v_mov_b32_e32 v16, s9
-; VI-NEXT: v_mov_b32_e32 v17, s10
-; VI-NEXT: v_mov_b32_e32 v18, s11
; VI-NEXT: s_add_u32 s2, s0, 48
+; VI-NEXT: v_cndmask_b32_e32 v15, 63, v15, vcc
+; VI-NEXT: v_mov_b32_e32 v19, s20
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v14
; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_add_u32_e32 v19, vcc, 1, v2
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2
-; VI-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 13, v2
-; VI-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 14, v2
-; VI-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2
-; VI-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2
-; VI-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2
-; VI-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2
-; VI-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2
-; VI-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2
-; VI-NEXT: v_cndmask_b32_e32 v6, v11, v1, vcc
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2
-; VI-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2
-; VI-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2
-; VI-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; VI-NEXT: v_cndmask_b32_e32 v11, v15, v1, vcc
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; VI-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2
-; VI-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v18, v1, vcc
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 3, v19
-; VI-NEXT: v_cndmask_b32_e32 v5, 63, v2, vcc
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 2, v19
-; VI-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v19
-; VI-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; VI-NEXT: v_cndmask_b32_e32 v2, 63, v11, vcc
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 7, v19
-; VI-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 6, v19
-; VI-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v19
-; VI-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 4, v19
-; VI-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 11, v19
-; VI-NEXT: v_cndmask_b32_e32 v13, 63, v10, vcc
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 10, v19
-; VI-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 9, v19
-; VI-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 8, v19
-; VI-NEXT: v_cndmask_b32_e32 v10, 63, v24, vcc
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 15, v19
-; VI-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 14, v19
-; VI-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 13, v19
-; VI-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19
+; VI-NEXT: v_cndmask_b32_e32 v14, v19, v1, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 12, v18
; VI-NEXT: v_mov_b32_e32 v19, s3
; VI-NEXT: v_mov_b32_e32 v18, s2
; VI-NEXT: s_add_u32 s2, s0, 32
-; VI-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc
+; VI-NEXT: v_cndmask_b32_e32 v14, 63, v14, vcc
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: flat_store_dwordx4 v[18:19], v[14:17]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -6496,105 +6498,105 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
; GFX9-IDXMODE: ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-IDXMODE-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-IDXMODE-NEXT: global_load_dword v3, v1, s[0:1] glc
+; GFX9-IDXMODE-NEXT: global_load_dword v14, v1, s[0:1] glc
; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
-; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-IDXMODE-NEXT: ;;#ASMSTART
; GFX9-IDXMODE-NEXT: v_mov_b32 v1, 62
; GFX9-IDXMODE-NEXT: ;;#ASMEND
-; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s20
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s21
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s22
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s23
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s16
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s17
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s18
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s19
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s12
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s13
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s14
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s15
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, s8
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, s9
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, s10
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v19, s11
-; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 12, v3
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 13, v3
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 14, v3
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 15, v3
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 8, v3
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 9, v3
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 10, v3
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v27, v10, v1, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 11, v3
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, v11, v1, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 4, v3
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 5, v3
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 6, v3
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 7, v3
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v12, v16, v1, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 2, v3
-; GFX9-IDXMODE-NEXT: v_add_u32_e32 v20, 1, v3
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, v18, v1, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v3
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, v19, v1, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 3, v20
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v6, 63, v3, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 2, v20
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s18
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s19
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s12
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s13
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s15
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s8
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s9
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s10
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s11
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s22
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s23
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s16
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 10, v14
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v16, v2, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 11, v14
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v17, v3, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 4, v14
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v19, v4, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 5, v14
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 6, v14
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 2, v14
+; GFX9-IDXMODE-NEXT: v_add_u32_e32 v18, 1, v14
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, v12, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, v13, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 3, v18
; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 1, v20
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 2, v18
; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, 63, v12, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 7, v20
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 1, v18
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 63, v2, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 7, v18
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v9, 63, v7, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 6, v18
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, 63, v6, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 5, v18
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, 63, v20, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 4, v18
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v6, 63, v19, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 11, v18
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v13, 63, v17, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 10, v18
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v12, 63, v16, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 14, v14
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v19, v10, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 15, v14
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v17, v11, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 8, v14
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, s17
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 9, v14
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, v16, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 9, v18
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, 63, v11, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 8, v18
; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 6, v20
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 5, v20
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 4, v20
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 11, v20
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v14, 63, v11, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 10, v20
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v13, 63, v27, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 9, v20
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 8, v20
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 15, v20
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v18, 63, v24, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 14, v20
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 13, v20
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc
-; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 12, v20
-; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 14, v18
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v18
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v17, 63, v17, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v16, 63, v19, vcc
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s21
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 13, v14
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v15, v15, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 13, v18
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v15, 63, v15, vcc
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v19, s20
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 12, v14
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v14, v19, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 12, v18
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, 0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v14, 63, v14, vcc
; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[15:18], s[0:1] offset:48
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] offset:48
; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
-; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[11:14], s[0:1] offset:32
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v18, v[10:13], s[0:1] offset:32
; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
-; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[7:10], s[0:1] offset:16
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16
; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
-; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[3:6], s[0:1]
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v18, v[2:5], s[0:1]
; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
; GFX9-IDXMODE-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-IDXMODE-NEXT: s_cbranch_execz .LBB17_2
@@ -6629,132 +6631,134 @@ bb2:
define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %out1, i32 %in) {
; GENERIC-LABEL: insert_w_offset_multiple_in_block:
; GENERIC: ; %bb.0: ; %entry
-; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GENERIC-NEXT: s_load_dword s4, s[4:5], 0xb
-; GENERIC-NEXT: s_mov_b32 s3, 0xf000
-; GENERIC-NEXT: s_mov_b32 s2, -1
-; GENERIC-NEXT: v_mov_b32_e32 v0, 0x41500000
-; GENERIC-NEXT: v_mov_b32_e32 v8, 0x41880000
-; GENERIC-NEXT: v_mov_b32_e32 v1, 0x41600000
-; GENERIC-NEXT: v_mov_b32_e32 v2, 0x41700000
-; GENERIC-NEXT: v_mov_b32_e32 v3, 0x41800000
-; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41100000
-; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41200000
-; GENERIC-NEXT: v_mov_b32_e32 v6, 0x41300000
-; GENERIC-NEXT: v_mov_b32_e32 v7, 0x41400000
+; GENERIC-NEXT: s_load_dwordx2 s[28:29], s[4:5], 0x9
+; GENERIC-NEXT: s_load_dword s24, s[4:5], 0xb
+; GENERIC-NEXT: s_mov_b32 s31, 0xf000
+; GENERIC-NEXT: s_mov_b32 s30, -1
+; GENERIC-NEXT: v_mov_b32_e32 v1, 0x41500000
+; GENERIC-NEXT: v_mov_b32_e32 v0, 0x41880000
+; GENERIC-NEXT: v_mov_b32_e32 v2, 0x41600000
+; GENERIC-NEXT: v_mov_b32_e32 v3, 0x41700000
+; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41800000
+; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41100000
+; GENERIC-NEXT: v_mov_b32_e32 v6, 0x41200000
+; GENERIC-NEXT: v_mov_b32_e32 v7, 0x41300000
+; GENERIC-NEXT: v_mov_b32_e32 v8, 0x41400000
; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000
; GENERIC-NEXT: v_mov_b32_e32 v10, 0x40c00000
; GENERIC-NEXT: v_mov_b32_e32 v11, 0x40e00000
; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41000000
; GENERIC-NEXT: v_mov_b32_e32 v15, 0x40400000
; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT: s_add_i32 s5, s4, 1
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 12
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 13
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 14
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 15
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 8
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 9
-; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 10
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 11
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 4
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 5
-; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v10, v10, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 6
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 7
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v12, v12, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 0
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v13, 1.0, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 1
-; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:16
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v14, 2.0, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 2
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v15, v15, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 3
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v16, 4.0, v8, vcc
-; GENERIC-NEXT: s_add_i32 s4, s4, 2
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 3
-; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0
+; GENERIC-NEXT: s_add_i32 s25, s24, 1
+; GENERIC-NEXT: s_cmp_eq_u32 s25, 12
; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s25, 13
+; GENERIC-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s25, 14
+; GENERIC-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s25, 15
+; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s25, 8
+; GENERIC-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s25, 9
+; GENERIC-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s25, 10
+; GENERIC-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s25, 11
+; GENERIC-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s25, 4
+; GENERIC-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s25, 5
+; GENERIC-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s25, 6
+; GENERIC-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s25, 7
+; GENERIC-NEXT: s_cselect_b64 s[20:21], -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s25, 0
+; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e64 v13, 1.0, v0, s[22:23]
+; GENERIC-NEXT: s_cmp_eq_u32 s25, 1
+; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e64 v14, 2.0, v0, s[22:23]
+; GENERIC-NEXT: s_cmp_eq_u32 s25, 2
+; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e64 v15, v15, v0, s[22:23]
+; GENERIC-NEXT: s_cmp_eq_u32 s25, 3
+; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e64 v16, 4.0, v0, s[22:23]
+; GENERIC-NEXT: s_add_i32 s26, s24, 2
+; GENERIC-NEXT: s_cmp_lg_u32 s26, 3
+; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[28:31], 0
+; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0
+; GENERIC-NEXT: s_cmp_lg_u32 s26, 2
+; GENERIC-NEXT: s_cselect_b64 s[24:25], -1, 0
; GENERIC-NEXT: s_waitcnt expcnt(0)
-; GENERIC-NEXT: v_cndmask_b32_e32 v16, v8, v16, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 2
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v15, v8, v15, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 1
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v14, v8, v14, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 0
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v13, v8, v13, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 7
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v12, v8, v12, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 6
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v11, v8, v11, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 5
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v10, v8, v10, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 4
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 11
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 10
-; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:80
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 9
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 8
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 15
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 14
-; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 13
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 12
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
-; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:64
+; GENERIC-NEXT: v_cndmask_b32_e64 v16, v0, v16, s[22:23]
+; GENERIC-NEXT: v_cndmask_b32_e64 v15, v0, v15, s[24:25]
+; GENERIC-NEXT: s_cmp_lg_u32 s26, 1
+; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e64 v14, v0, v14, s[22:23]
+; GENERIC-NEXT: s_cmp_lg_u32 s26, 0
+; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e64 v13, v0, v13, s[22:23]
+; GENERIC-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[14:15]
+; GENERIC-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[16:17]
+; GENERIC-NEXT: v_cndmask_b32_e64 v11, v11, v0, s[18:19]
+; GENERIC-NEXT: v_cndmask_b32_e64 v12, v12, v0, s[20:21]
+; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[28:31], 0 offset:16
+; GENERIC-NEXT: s_cmp_lg_u32 s26, 7
+; GENERIC-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GENERIC-NEXT: s_cmp_lg_u32 s26, 6
+; GENERIC-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_cndmask_b32_e64 v12, v0, v12, s[14:15]
+; GENERIC-NEXT: v_cndmask_b32_e64 v11, v0, v11, s[16:17]
+; GENERIC-NEXT: s_cmp_lg_u32 s26, 5
+; GENERIC-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e64 v10, v0, v10, s[14:15]
+; GENERIC-NEXT: s_cmp_lg_u32 s26, 4
+; GENERIC-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e64 v9, v0, v9, s[14:15]
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GENERIC-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[0:1]
+; GENERIC-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[2:3]
+; GENERIC-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[4:5]
+; GENERIC-NEXT: v_cndmask_b32_e64 v5, v5, v0, s[6:7]
+; GENERIC-NEXT: buffer_store_dwordx4 v[1:4], off, s[28:31], 0 offset:48
+; GENERIC-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[8:9]
+; GENERIC-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[10:11]
+; GENERIC-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[12:13]
+; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[28:31], 0 offset:32
+; GENERIC-NEXT: s_cmp_lg_u32 s26, 11
+; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[28:31], 0 offset:80
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_waitcnt expcnt(1)
+; GENERIC-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s26, 10
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, v0, v7, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s26, 9
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, v0, v6, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s26, 8
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s26, 15
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s26, 14
+; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[28:31], 0 offset:96
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s26, 13
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s26, 12
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
+; GENERIC-NEXT: buffer_store_dwordx4 v[1:4], off, s[28:31], 0 offset:112
+; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[28:31], 0 offset:64
; GENERIC-NEXT: s_endpgm
;
; NOOPT-LABEL: insert_w_offset_multiple_in_block:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
index 48a168b4bfbe71..d5b6c19399a1f8 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
@@ -1314,108 +1314,108 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr
define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
; SI-LABEL: v_insertelement_v16bf16_dynamic:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
-; SI-NEXT: s_mov_b32 s11, 0x100f000
-; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0
+; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x4
+; SI-NEXT: s_mov_b32 s3, 0x100f000
+; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_lshlrev_b32_e32 v4, 5, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT: s_mov_b64 s[0:1], s[14:15]
; SI-NEXT: v_mov_b32_e32 v5, 0
-; SI-NEXT: buffer_load_dwordx4 v[7:10], v[4:5], s[8:11], 0 addr64
-; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 offset:16
-; SI-NEXT: s_cmp_eq_u32 s5, 6
-; SI-NEXT: v_mov_b32_e32 v6, s4
+; SI-NEXT: buffer_load_dwordx4 v[7:10], v[4:5], s[0:3], 0 addr64
+; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 offset:16
+; SI-NEXT: s_cmp_eq_u32 s7, 6
+; SI-NEXT: v_mov_b32_e32 v6, s6
; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: s_cmp_eq_u32 s5, 7
-; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT: s_cmp_eq_u32 s7, 7
+; SI-NEXT: s_mov_b64 s[14:15], s[2:3]
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cndmask_b32_e32 v11, v10, v6, vcc
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: s_cmp_eq_u32 s5, 4
-; SI-NEXT: v_cndmask_b32_e32 v10, v10, v6, vcc
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: s_cmp_eq_u32 s5, 5
+; SI-NEXT: s_cmp_eq_u32 s7, 4
+; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT: s_cmp_eq_u32 s7, 5
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9
-; SI-NEXT: v_cndmask_b32_e32 v9, v9, v6, vcc
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: s_cmp_eq_u32 s5, 2
+; SI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1]
+; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT: s_cmp_eq_u32 s7, 2
+; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; SI-NEXT: s_cmp_eq_u32 s7, 3
+; SI-NEXT: v_cndmask_b32_e32 v10, v10, v6, vcc
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8
+; SI-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[2:3]
+; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; SI-NEXT: s_cmp_eq_u32 s7, 0
; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: s_cmp_eq_u32 s5, 3
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; SI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[0:1]
+; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_or_b32_e32 v10, v11, v10
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12
-; SI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: s_cmp_eq_u32 s5, 0
-; SI-NEXT: v_or_b32_e32 v9, v9, v11
-; SI-NEXT: v_cndmask_b32_e32 v11, v13, v6, vcc
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: s_cmp_eq_u32 s5, 1
+; SI-NEXT: v_cndmask_b32_e64 v12, v13, v6, s[2:3]
+; SI-NEXT: s_cmp_eq_u32 s7, 1
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7
; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: s_cmp_eq_u32 s5, 14
-; SI-NEXT: v_or_b32_e32 v8, v8, v11
-; SI-NEXT: v_cndmask_b32_e32 v11, v14, v6, vcc
+; SI-NEXT: s_cmp_eq_u32 s7, 14
+; SI-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[4:5]
+; SI-NEXT: v_or_b32_e32 v8, v8, v12
+; SI-NEXT: v_cndmask_b32_e32 v12, v14, v6, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: s_cmp_eq_u32 s5, 15
+; SI-NEXT: s_cmp_eq_u32 s7, 15
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: s_cmp_eq_u32 s5, 12
-; SI-NEXT: v_or_b32_e32 v7, v7, v11
-; SI-NEXT: v_cndmask_b32_e32 v11, v15, v6, vcc
+; SI-NEXT: s_cmp_eq_u32 s7, 12
+; SI-NEXT: v_or_b32_e32 v7, v7, v12
+; SI-NEXT: v_cndmask_b32_e32 v12, v15, v6, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: s_cmp_eq_u32 s5, 13
+; SI-NEXT: s_cmp_eq_u32 s7, 13
; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: s_cmp_eq_u32 s5, 10
-; SI-NEXT: v_or_b32_e32 v3, v3, v11
-; SI-NEXT: v_cndmask_b32_e32 v11, v16, v6, vcc
+; SI-NEXT: s_cmp_eq_u32 s7, 10
+; SI-NEXT: v_or_b32_e32 v3, v3, v12
+; SI-NEXT: v_cndmask_b32_e32 v12, v16, v6, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: s_cmp_eq_u32 s5, 11
+; SI-NEXT: s_cmp_eq_u32 s7, 11
; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: s_cmp_eq_u32 s5, 8
-; SI-NEXT: v_or_b32_e32 v2, v2, v11
-; SI-NEXT: v_cndmask_b32_e32 v11, v17, v6, vcc
+; SI-NEXT: s_cmp_eq_u32 s7, 8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; SI-NEXT: v_or_b32_e32 v2, v2, v12
+; SI-NEXT: v_cndmask_b32_e32 v12, v17, v6, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: s_cmp_eq_u32 s5, 9
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0
+; SI-NEXT: s_cmp_eq_u32 s7, 9
+; SI-NEXT: v_or_b32_e32 v9, v9, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_or_b32_e32 v1, v1, v11
+; SI-NEXT: v_or_b32_e32 v1, v1, v12
; SI-NEXT: v_or_b32_e32 v0, v0, v6
-; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 offset:16
-; SI-NEXT: buffer_store_dwordx4 v[7:10], v[4:5], s[0:3], 0 addr64
+; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[12:15], 0 addr64 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[7:10], v[4:5], s[12:15], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v16bf16_dynamic:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
@@ -1429,81 +1429,81 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out
; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8
-; VI-NEXT: s_cmp_eq_u32 s5, 14
+; VI-NEXT: s_cmp_eq_u32 s7, 14
; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
-; VI-NEXT: v_mov_b32_e32 v12, s4
+; VI-NEXT: v_mov_b32_e32 v12, s6
; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 15
+; VI-NEXT: s_cmp_eq_u32 s7, 15
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cndmask_b32_e32 v13, v3, v12, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 12
-; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 13
+; VI-NEXT: s_cmp_eq_u32 s7, 12
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: s_cmp_eq_u32 s7, 13
; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 10
-; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 11
+; VI-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[0:1]
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: s_cmp_eq_u32 s7, 10
+; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT: s_cmp_eq_u32 s7, 11
; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1
-; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14
-; VI-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 8
-; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_cndmask_b32_e32 v13, v15, v12, vcc
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 9
+; VI-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3]
+; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT: s_cmp_eq_u32 s7, 8
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; VI-NEXT: v_cndmask_b32_e64 v15, v15, v12, s[2:3]
+; VI-NEXT: s_cmp_eq_u32 s7, 9
; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0
-; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 6
-; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_cndmask_b32_e32 v13, v16, v12, vcc
+; VI-NEXT: s_cmp_eq_u32 s7, 6
+; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_cndmask_b32_e32 v15, v16, v12, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 7
+; VI-NEXT: s_cmp_eq_u32 s7, 7
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[4:5]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7
-; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_cndmask_b32_e64 v14, v14, v12, s[0:1]
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 4
-; VI-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc
+; VI-NEXT: s_cmp_eq_u32 s7, 4
+; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14
+; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 5
-; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v6
-; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; VI-NEXT: s_cmp_eq_u32 s7, 5
+; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6
; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 2
-; VI-NEXT: v_or_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_cndmask_b32_e32 v13, v18, v12, vcc
+; VI-NEXT: s_cmp_eq_u32 s7, 2
+; VI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 3
-; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5
-; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; VI-NEXT: s_cmp_eq_u32 s7, 3
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5
; VI-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 0
-; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_cndmask_b32_e32 v13, v19, v12, vcc
+; VI-NEXT: s_cmp_eq_u32 s7, 0
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 1
-; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v4
+; VI-NEXT: s_cmp_eq_u32 s7, 1
+; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4
; VI-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_cndmask_b32_e32 v12, v20, v12, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; VI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; VI-NEXT: v_or_b32_sdwa v5, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; VI-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
@@ -1542,16 +1542,14 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out
; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc
; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
; GFX900-NEXT: s_cmp_eq_u32 s5, 0
-; GFX900-NEXT: v_perm_b32 v3, v10, v3, s2
-; GFX900-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v12, v9, vcc
; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
; GFX900-NEXT: s_cmp_eq_u32 s5, 1
; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v1
; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
; GFX900-NEXT: s_cmp_eq_u32 s5, 14
-; GFX900-NEXT: v_perm_b32 v2, v10, v2, s2
-; GFX900-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v13, v9, vcc
; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
; GFX900-NEXT: s_cmp_eq_u32 s5, 15
; GFX900-NEXT: s_waitcnt vmcnt(0)
@@ -1559,30 +1557,32 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out
; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
; GFX900-NEXT: s_cmp_eq_u32 s5, 12
-; GFX900-NEXT: v_perm_b32 v1, v10, v1, s2
-; GFX900-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc
+; GFX900-NEXT: v_perm_b32 v1, v12, v1, s2
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v14, v9, vcc
; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
; GFX900-NEXT: s_cmp_eq_u32 s5, 13
; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v7
; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
; GFX900-NEXT: s_cmp_eq_u32 s5, 10
-; GFX900-NEXT: v_perm_b32 v8, v10, v8, s2
-; GFX900-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc
+; GFX900-NEXT: v_perm_b32 v8, v12, v8, s2
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v15, v9, vcc
; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
; GFX900-NEXT: s_cmp_eq_u32 s5, 11
-; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v6
+; GFX900-NEXT: v_perm_b32 v3, v10, v3, s2
+; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v6
; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc
; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
; GFX900-NEXT: s_cmp_eq_u32 s5, 8
-; GFX900-NEXT: v_perm_b32 v7, v10, v7, s2
-; GFX900-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc
; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
; GFX900-NEXT: s_cmp_eq_u32 s5, 9
-; GFX900-NEXT: v_lshrrev_b32_e32 v17, 16, v5
+; GFX900-NEXT: v_perm_b32 v2, v11, v2, s2
+; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v5
; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc
+; GFX900-NEXT: v_perm_b32 v7, v12, v7, s2
; GFX900-NEXT: v_perm_b32 v6, v10, v6, s2
; GFX900-NEXT: v_perm_b32 v5, v9, v5, s2
; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index d09af8fd2ac954..12b4b2b372ef8e 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -2794,16 +2794,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s5, 0
-; GFX9-NEXT: v_perm_b32 v3, v10, v3, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v12, v9, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s5, 1
; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s5, 14
-; GFX9-NEXT: v_perm_b32 v2, v10, v2, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v13, v9, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s5, 15
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -2811,30 +2809,32 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s5, 12
-; GFX9-NEXT: v_perm_b32 v1, v10, v1, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc
+; GFX9-NEXT: v_perm_b32 v1, v12, v1, s2
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v14, v9, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s5, 13
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7
; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s5, 10
-; GFX9-NEXT: v_perm_b32 v8, v10, v8, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc
+; GFX9-NEXT: v_perm_b32 v8, v12, v8, s2
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v15, v9, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s5, 11
-; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v6
+; GFX9-NEXT: v_perm_b32 v3, v10, v3, s2
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v6
; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s5, 8
-; GFX9-NEXT: v_perm_b32 v7, v10, v7, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s5, 9
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v5
+; GFX9-NEXT: v_perm_b32 v2, v11, v2, s2
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v5
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc
+; GFX9-NEXT: v_perm_b32 v7, v12, v7, s2
; GFX9-NEXT: v_perm_b32 v6, v10, v6, s2
; GFX9-NEXT: v_perm_b32 v5, v9, v5, s2
; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16
@@ -2844,7 +2844,7 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
; VI-LABEL: v_insertelement_v16f16_dynamic:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
@@ -2858,81 +2858,81 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8
-; VI-NEXT: s_cmp_eq_u32 s5, 14
+; VI-NEXT: s_cmp_eq_u32 s7, 14
; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
-; VI-NEXT: v_mov_b32_e32 v12, s4
+; VI-NEXT: v_mov_b32_e32 v12, s6
; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 15
+; VI-NEXT: s_cmp_eq_u32 s7, 15
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cndmask_b32_e32 v13, v3, v12, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 12
-; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 13
+; VI-NEXT: s_cmp_eq_u32 s7, 12
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: s_cmp_eq_u32 s7, 13
; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 10
-; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 11
+; VI-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[0:1]
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: s_cmp_eq_u32 s7, 10
+; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT: s_cmp_eq_u32 s7, 11
; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1
-; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14
-; VI-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 8
-; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_cndmask_b32_e32 v13, v15, v12, vcc
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 9
+; VI-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3]
+; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT: s_cmp_eq_u32 s7, 8
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; VI-NEXT: v_cndmask_b32_e64 v15, v15, v12, s[2:3]
+; VI-NEXT: s_cmp_eq_u32 s7, 9
; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0
-; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 6
-; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_cndmask_b32_e32 v13, v16, v12, vcc
+; VI-NEXT: s_cmp_eq_u32 s7, 6
+; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_cndmask_b32_e32 v15, v16, v12, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 7
+; VI-NEXT: s_cmp_eq_u32 s7, 7
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[4:5]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7
-; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_cndmask_b32_e64 v14, v14, v12, s[0:1]
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 4
-; VI-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc
+; VI-NEXT: s_cmp_eq_u32 s7, 4
+; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14
+; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 5
-; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v6
-; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; VI-NEXT: s_cmp_eq_u32 s7, 5
+; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6
; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 2
-; VI-NEXT: v_or_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_cndmask_b32_e32 v13, v18, v12, vcc
+; VI-NEXT: s_cmp_eq_u32 s7, 2
+; VI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 3
-; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5
-; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; VI-NEXT: s_cmp_eq_u32 s7, 3
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5
; VI-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 0
-; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_cndmask_b32_e32 v13, v19, v12, vcc
+; VI-NEXT: s_cmp_eq_u32 s7, 0
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s5, 1
-; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v4
+; VI-NEXT: s_cmp_eq_u32 s7, 1
+; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4
; VI-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_cndmask_b32_e32 v12, v20, v12, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; VI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; VI-NEXT: v_or_b32_sdwa v5, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; VI-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
@@ -2965,101 +2965,101 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v10
; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v9
; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v8
; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
+; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v7
-; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
; CI-NEXT: v_cndmask_b32_e64 v10, v10, v6, s[0:1]
; CI-NEXT: s_cselect_b64 s[0:1], -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 11
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v3
; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc
+; CI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[2:3]
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 10
-; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1]
; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc
+; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
; CI-NEXT: s_cselect_b64 vcc, -1, 0
+; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc
+; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v7
+; CI-NEXT: v_or_b32_e32 v9, v9, v12
+; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; CI-NEXT: v_or_b32_e32 v8, v8, v12
+; CI-NEXT: v_cvt_f32_f16_e32 v12, v14
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; CI-NEXT: v_cvt_f32_f16_e32 v13, v15
; CI-NEXT: s_cmp_eq_u32 s5, 9
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v2
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
-; CI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 8
-; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
-; CI-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc
+; CI-NEXT: v_cvt_f32_f16_e32 v14, v16
+; CI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 7
-; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v9
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 6
-; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
-; CI-NEXT: v_cndmask_b32_e32 v15, v15, v6, vcc
+; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 5
-; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
; CI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 4
-; CI-NEXT: v_or_b32_e32 v10, v10, v11
-; CI-NEXT: v_cndmask_b32_e32 v11, v16, v6, vcc
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc
; CI-NEXT: s_cselect_b64 vcc, -1, 0
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
; CI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
-; CI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[2:3]
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1]
-; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
-; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT: v_or_b32_e32 v2, v2, v11
-; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0
; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; CI-NEXT: s_cmp_eq_u32 s5, 3
+; CI-NEXT: v_or_b32_e32 v10, v10, v11
+; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; CI-NEXT: v_or_b32_e32 v7, v7, v12
+; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13
; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; CI-NEXT: v_or_b32_e32 v9, v9, v12
+; CI-NEXT: v_or_b32_e32 v3, v3, v12
; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v14
+; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT: v_or_b32_e32 v2, v2, v12
+; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; CI-NEXT: s_cmp_eq_u32 s5, 3
+; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 2
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_or_b32_e32 v7, v7, v12
-; CI-NEXT: v_cndmask_b32_e32 v12, v17, v6, vcc
+; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 1
; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 0
-; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
-; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc
; CI-NEXT: s_cselect_b64 vcc, -1, 0
-; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
; CI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v12
-; CI-NEXT: v_or_b32_e32 v8, v8, v13
-; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v15
-; CI-NEXT: v_or_b32_e32 v1, v1, v6
; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11
-; CI-NEXT: v_or_b32_e32 v3, v3, v13
+; CI-NEXT: v_or_b32_e32 v1, v1, v6
+; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v12
; CI-NEXT: v_or_b32_e32 v0, v0, v6
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 26a4ea9d8a4b6e..edf900a50cd4b4 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -5413,33 +5413,33 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9]
; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v12
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11]
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v8, v13, vcc
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0
-; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, v2, v14
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v6, 0
+; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, v2, v14
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0
; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v15, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v5, v[1:2]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v16, v7, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v18, v4, v[14:15]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v6, v[16:17]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v14
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15]
; GFX7-GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v0
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0
; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc
-; GFX7-GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v2
+; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v2
; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v6
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v13, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0
; GFX7-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc
; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1]
; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v7
; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, v[0:1]
-; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v12
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1]
+; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v13, v[2:3]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3]
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0
; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2]
@@ -5518,33 +5518,33 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9]
; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v12
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11]
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v8, v13, vcc
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0
-; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, v2, v14
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v6, 0
+; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, v2, v14
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v15, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v5, v[1:2]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v16, v7, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v18, v4, v[14:15]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v6, v[16:17]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v14
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15]
; GFX8-GISEL-NEXT: v_add_u32_e32 v11, vcc, 1, v0
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0
; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc
-; GFX8-GISEL-NEXT: v_add_u32_e32 v13, vcc, 1, v2
+; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v2
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v6
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v13, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0
; GFX8-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc
; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v10
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1]
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v7
; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, v[0:1]
-; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v12
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v13
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v13, v[2:3]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3]
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0
; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2]
@@ -5615,33 +5615,33 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9]
; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v12
; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11]
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v8, v13, vcc
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v8, v13, vcc
; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, v2, v14
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v6, 0
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, v2, v14
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v11
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v19, vcc, v9, v15, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v5, v[1:2]
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v16, v7, v[1:2]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v18, v4, v[14:15]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v6, v[16:17]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2]
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v14
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v9, v15, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15]
; GFX900-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v0
; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0
; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v8, vcc
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v13, vcc, 1, v2
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v6
; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v13, 0
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0
; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v9, vcc
; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v10
; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1]
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v7
; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, v[0:1]
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v12
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1]
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v13
; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v13, v[2:3]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3]
; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0
; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v4, vcc
; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2]
diff --git a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir
index dd478f94e1039e..98552de05c8572 100644
--- a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir
+++ b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir
@@ -45,6 +45,10 @@ body: |
; GCN-NEXT: [[V_CVT_F64_I32_e32_10:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY10]], implicit $mode, implicit $exec
; GCN-NEXT: [[V_CVT_F64_I32_e32_11:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY11]], implicit $mode, implicit $exec
; GCN-NEXT: [[V_CVT_F64_I32_e32_12:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY12]], implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_CVT_F64_I32_e32_13:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY13]], implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_CVT_F64_I32_e32_14:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY14]], implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_CVT_F64_I32_e32_15:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY15]], implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_CVT_F64_I32_e32_16:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY16]], implicit $mode, implicit $exec
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
; GCN-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
@@ -64,14 +68,10 @@ body: |
; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_10]], implicit $exec
; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_11]], implicit $exec
; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_12]], implicit $exec
- ; GCN-NEXT: [[V_CVT_F64_I32_e32_13:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY13]], implicit $mode, implicit $exec
- ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_13]], implicit $exec
- ; GCN-NEXT: [[V_CVT_F64_I32_e32_14:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY14]], implicit $mode, implicit $exec
- ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_14]], implicit $exec
- ; GCN-NEXT: [[V_CVT_F64_I32_e32_15:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY15]], implicit $mode, implicit $exec
- ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_15]], implicit $exec
- ; GCN-NEXT: [[V_CVT_F64_I32_e32_16:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY16]], implicit $mode, implicit $exec
- ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_16]], implicit $exec
+ ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_13]], implicit $exec
+ ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_14]], implicit $exec
+ ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_15]], implicit $exec
+ ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_16]], implicit $exec
; GCN-NEXT: [[V_CVT_F64_I32_e32_17:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY17]], implicit $mode, implicit $exec
; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_17]], implicit $exec
; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index 1d0367db701436..4532571d5cf2a1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -2059,207 +2059,207 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) {
; GFX7-LABEL: v_maximum_v16f16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v16
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
-; GFX7-NEXT: v_cmp_o_f32_e64 s[12:13], v0, v16
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v16
-; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v22
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9
-; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v6, v16
-; GFX7-NEXT: v_max_f32_e32 v6, v6, v16
-; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v23
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v7, v16
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v16
-; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v24
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v17
; GFX7-NEXT: v_max_f32_e32 v1, v1, v17
; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
-; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v16
-; GFX7-NEXT: v_max_f32_e32 v8, v8, v16
-; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v25
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v17
; GFX7-NEXT: v_max_f32_e32 v2, v2, v17
; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v19
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9
; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
-; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v16
-; GFX7-NEXT: v_max_f32_e32 v9, v9, v16
-; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v26
+; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
; GFX7-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v17
; GFX7-NEXT: v_max_f32_e32 v3, v3, v17
; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v20
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10
; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v28
; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
-; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v16
-; GFX7-NEXT: v_max_f32_e32 v10, v10, v16
-; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32
+; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18
; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v17
; GFX7-NEXT: v_max_f32_e32 v4, v4, v17
; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v21
-; GFX7-NEXT: v_cvt_f16_f32_e32 v20, v28
-; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12
-; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v29
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12
; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13
-; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v30
-; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v18
+; GFX7-NEXT: v_max_f32_e32 v12, v12, v18
+; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v29
; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v17
; GFX7-NEXT: v_max_f32_e32 v5, v5, v17
-; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v27
-; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15
-; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20
+; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v22
+; GFX7-NEXT: v_cvt_f16_f32_e32 v20, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v13
; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12
-; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19
-; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v19
+; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v20
+; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v18, v16
+; GFX7-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v17
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v17
+; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v23
+; GFX7-NEXT: v_max_f32_e32 v16, v18, v16
+; GFX7-NEXT: v_max_f32_e32 v18, v13, v0
+; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v15
+; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v30
+; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v17
+; GFX7-NEXT: v_max_f32_e32 v7, v7, v17
+; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v24
+; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15
; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14
-; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v13
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT: v_mov_b32_e32 v19, 0x7fc00000
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v19, v1, vcc
+; GFX7-NEXT: v_cndmask_b32_e64 v13, v19, v16, s[26:27]
+; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v8, v17
+; GFX7-NEXT: v_max_f32_e32 v8, v8, v17
+; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v25
+; GFX7-NEXT: v_max_f32_e32 v16, v14, v15
+; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v14, v15
+; GFX7-NEXT: v_cndmask_b32_e32 v14, v19, v16, vcc
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT: v_cndmask_b32_e64 v2, v19, v2, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[6:7]
+; GFX7-NEXT: v_cndmask_b32_e64 v4, v19, v4, s[8:9]
+; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v9, v17
+; GFX7-NEXT: v_max_f32_e32 v9, v9, v17
+; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v26
+; GFX7-NEXT: v_cndmask_b32_e64 v5, v19, v5, s[10:11]
+; GFX7-NEXT: v_cndmask_b32_e64 v6, v19, v6, s[12:13]
+; GFX7-NEXT: v_cndmask_b32_e64 v7, v19, v7, s[14:15]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT: v_cndmask_b32_e64 v8, v19, v8, s[16:17]
+; GFX7-NEXT: v_cndmask_b32_e64 v9, v19, v9, s[18:19]
+; GFX7-NEXT: v_cndmask_b32_e64 v12, v19, v12, s[24:25]
+; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v10, v17
+; GFX7-NEXT: v_max_f32_e32 v10, v10, v17
+; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v27
+; GFX7-NEXT: v_cndmask_b32_e64 v10, v19, v10, s[20:21]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v17
; GFX7-NEXT: v_max_f32_e32 v11, v11, v17
-; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000
-; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v12, v20
-; GFX7-NEXT: v_max_f32_e32 v12, v12, v20
-; GFX7-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc
-; GFX7-NEXT: v_max_f32_e32 v20, v13, v19
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v13, v19
-; GFX7-NEXT: v_cndmask_b32_e32 v13, v17, v20, vcc
-; GFX7-NEXT: v_max_f32_e32 v19, v14, v18
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v14, v18
-; GFX7-NEXT: v_cndmask_b32_e32 v14, v17, v19, vcc
-; GFX7-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[12:13]
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5]
-; GFX7-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7]
-; GFX7-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9]
-; GFX7-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11]
-; GFX7-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[14:15]
-; GFX7-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[16:17]
-; GFX7-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19]
-; GFX7-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21]
-; GFX7-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23]
-; GFX7-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25]
+; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32
+; GFX7-NEXT: v_cndmask_b32_e64 v11, v19, v11, s[22:23]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT: v_max_f32_e32 v18, v15, v16
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v16
-; GFX7-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v0
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v19, v18, s[28:29]
+; GFX7-NEXT: v_max_f32_e32 v15, v20, v17
+; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v20, v17
+; GFX7-NEXT: v_cndmask_b32_e32 v15, v19, v15, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximum_v16f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v15
-; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v7
-; GFX8-NEXT: v_max_f16_e32 v18, v17, v16
-; GFX8-NEXT: v_mov_b32_e32 v19, 0x7e00
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v17, v16
-; GFX8-NEXT: v_cndmask_b32_e32 v16, v19, v18, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v14
; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v6
-; GFX8-NEXT: v_max_f16_e32 v20, v18, v17
+; GFX8-NEXT: v_max_f16_e32 v16, v18, v17
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v18, v17
-; GFX8-NEXT: v_cndmask_b32_e32 v17, v19, v20, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v13
-; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v5
-; GFX8-NEXT: v_max_f16_e32 v21, v20, v18
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v20, v18
-; GFX8-NEXT: v_cndmask_b32_e32 v18, v19, v21, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v12
-; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v4
-; GFX8-NEXT: v_max_f16_e32 v22, v21, v20
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v21, v20
-; GFX8-NEXT: v_cndmask_b32_e32 v20, v19, v22, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v11
-; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v3
-; GFX8-NEXT: v_max_f16_e32 v23, v22, v21
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v22, v21
-; GFX8-NEXT: v_cndmask_b32_e32 v21, v19, v23, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v10
-; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v2
-; GFX8-NEXT: v_max_f16_e32 v24, v23, v22
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v23, v22
-; GFX8-NEXT: v_cndmask_b32_e32 v22, v19, v24, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v9
-; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v1
-; GFX8-NEXT: v_max_f16_e32 v25, v24, v23
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v24, v23
-; GFX8-NEXT: v_cndmask_b32_e32 v23, v19, v25, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v0
-; GFX8-NEXT: v_max_f16_e32 v26, v25, v24
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v25, v24
-; GFX8-NEXT: v_cndmask_b32_e32 v24, v19, v26, vcc
-; GFX8-NEXT: v_max_f16_e32 v25, v7, v15
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v7, v15
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v19, v25, vcc
-; GFX8-NEXT: v_max_f16_e32 v15, v6, v14
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v6, v14
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v19, v15, vcc
-; GFX8-NEXT: v_max_f16_e32 v14, v5, v13
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v13
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v19, v14, vcc
-; GFX8-NEXT: v_max_f16_e32 v13, v4, v12
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v4, v12
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v19, v13, vcc
-; GFX8-NEXT: v_max_f16_e32 v12, v3, v11
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v11
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v12, vcc
-; GFX8-NEXT: v_max_f16_e32 v11, v2, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v5
+; GFX8-NEXT: v_max_f16_e32 v20, v18, v17
+; GFX8-NEXT: v_cmp_o_f16_e64 s[4:5], v18, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v4
+; GFX8-NEXT: v_max_f16_e32 v21, v18, v17
+; GFX8-NEXT: v_cmp_o_f16_e64 s[6:7], v18, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v3
+; GFX8-NEXT: v_max_f16_e32 v22, v18, v17
+; GFX8-NEXT: v_cmp_o_f16_e64 s[8:9], v18, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v2
+; GFX8-NEXT: v_max_f16_e32 v23, v18, v17
+; GFX8-NEXT: v_cmp_o_f16_e64 s[10:11], v18, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v1
+; GFX8-NEXT: v_max_f16_e32 v24, v18, v17
+; GFX8-NEXT: v_cmp_o_f16_e64 s[12:13], v18, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v0
+; GFX8-NEXT: v_max_f16_e32 v25, v18, v17
+; GFX8-NEXT: v_cmp_o_f16_e64 s[14:15], v18, v17
+; GFX8-NEXT: v_max_f16_e32 v17, v6, v14
+; GFX8-NEXT: v_cmp_o_f16_e64 s[16:17], v6, v14
+; GFX8-NEXT: v_max_f16_e32 v6, v5, v13
+; GFX8-NEXT: v_cmp_o_f16_e64 s[18:19], v5, v13
+; GFX8-NEXT: v_max_f16_e32 v5, v4, v12
+; GFX8-NEXT: v_cmp_o_f16_e64 s[20:21], v4, v12
+; GFX8-NEXT: v_max_f16_e32 v4, v3, v11
+; GFX8-NEXT: v_cmp_o_f16_e64 s[22:23], v3, v11
+; GFX8-NEXT: v_max_f16_e32 v11, v7, v15
+; GFX8-NEXT: v_cmp_o_f16_e64 s[24:25], v7, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_mov_b32_e32 v19, 0x7e00
+; GFX8-NEXT: v_max_f16_e32 v13, v7, v12
+; GFX8-NEXT: v_cmp_o_f16_e64 s[26:27], v7, v12
+; GFX8-NEXT: v_max_f16_e32 v3, v2, v10
+; GFX8-NEXT: v_cndmask_b32_e64 v12, v19, v13, s[26:27]
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v19, v16, vcc
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v2, v10
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v11, vcc
-; GFX8-NEXT: v_max_f16_e32 v10, v1, v9
+; GFX8-NEXT: v_max_f16_e32 v14, v1, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v9
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v10, vcc
-; GFX8-NEXT: v_max_f16_e32 v9, v0, v8
+; GFX8-NEXT: v_max_f16_e32 v7, v0, v8
+; GFX8-NEXT: v_cndmask_b32_e64 v18, v19, v22, s[8:9]
+; GFX8-NEXT: v_cndmask_b32_e64 v22, v19, v25, s[14:15]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v14, vcc
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v8
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v19, v9, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v24
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v23
-; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v22
-; GFX8-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v21
-; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v20
-; GFX8-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v18
-; GFX8-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v17
-; GFX8-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v16
-; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_cndmask_b32_e64 v16, v19, v21, s[6:7]
+; GFX8-NEXT: v_cndmask_b32_e64 v21, v19, v24, s[12:13]
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v19, v7, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v22
+; GFX8-NEXT: v_cndmask_b32_e64 v15, v19, v20, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v20, v19, v23, s[10:11]
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v21
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v20
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v19, v4, s[22:23]
+; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v18
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v19, v5, s[20:21]
+; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v16
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v19, v6, s[18:19]
+; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v15
+; GFX8-NEXT: v_cndmask_b32_e64 v11, v19, v11, s[24:25]
+; GFX8-NEXT: v_cndmask_b32_e64 v17, v19, v17, s[16:17]
+; GFX8-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v12
+; GFX8-NEXT: v_or_b32_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v7, v11, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_maximum_v16f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
index df7355c2c57bfa..584dd2700c419a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
@@ -1730,20 +1730,20 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v16
-; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32
-; GFX7-NEXT: v_writelane_b32 v31, s30, 0
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v17
; GFX7-NEXT: v_max_f32_e32 v1, v1, v17
+; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32
+; GFX7-NEXT: v_writelane_b32 v31, s30, 0
+; GFX7-NEXT: v_writelane_b32 v31, s31, 1
; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18
; GFX7-NEXT: v_max_f32_e32 v2, v2, v18
-; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000
-; GFX7-NEXT: v_max_f32_e32 v18, v13, v29
-; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29
-; GFX7-NEXT: v_writelane_b32 v31, s31, 1
; GFX7-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19
; GFX7-NEXT: v_max_f32_e32 v3, v3, v19
+; GFX7-NEXT: v_mov_b32_e32 v18, 0x7fc00000
+; GFX7-NEXT: v_max_f32_e32 v19, v0, v16
+; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16
+; GFX7-NEXT: v_max_f32_e32 v16, v14, v30
+; GFX7-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30
; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20
; GFX7-NEXT: v_max_f32_e32 v4, v4, v20
; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21
@@ -1752,39 +1752,39 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
; GFX7-NEXT: v_max_f32_e32 v6, v6, v22
; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23
; GFX7-NEXT: v_max_f32_e32 v7, v7, v23
-; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24
+; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v8, v24
; GFX7-NEXT: v_max_f32_e32 v8, v8, v24
-; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25
+; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v9, v25
; GFX7-NEXT: v_max_f32_e32 v9, v9, v25
-; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26
+; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v10, v26
; GFX7-NEXT: v_max_f32_e32 v10, v10, v26
-; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27
+; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27
; GFX7-NEXT: v_max_f32_e32 v11, v11, v27
-; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28
+; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28
; GFX7-NEXT: v_max_f32_e32 v12, v12, v28
-; GFX7-NEXT: v_max_f32_e32 v19, v14, v30
-; GFX7-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
-; GFX7-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29]
-; GFX7-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17]
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5]
-; GFX7-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7]
-; GFX7-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9]
-; GFX7-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11]
-; GFX7-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13]
-; GFX7-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15]
-; GFX7-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19]
-; GFX7-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21]
-; GFX7-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23]
-; GFX7-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25]
-; GFX7-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27]
-; GFX7-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31]
+; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29
+; GFX7-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc
+; GFX7-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31]
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29]
+; GFX7-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7]
+; GFX7-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9]
+; GFX7-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11]
+; GFX7-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13]
+; GFX7-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15]
+; GFX7-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[16:17]
+; GFX7-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[18:19]
+; GFX7-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[20:21]
+; GFX7-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23]
+; GFX7-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25]
+; GFX7-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27]
; GFX7-NEXT: v_readlane_b32 s31, v31, 1
; GFX7-NEXT: v_readlane_b32 s30, v31, 0
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_max_f32_e32 v18, v15, v16
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v16
-; GFX7-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX7-NEXT: v_max_f32_e32 v16, v15, v17
+; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v17
+; GFX7-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
@@ -1797,20 +1797,20 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16
-; GFX8-NEXT: v_max_f32_e32 v0, v0, v16
-; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32
-; GFX8-NEXT: v_writelane_b32 v31, s30, 0
; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v17
; GFX8-NEXT: v_max_f32_e32 v1, v1, v17
+; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32
+; GFX8-NEXT: v_writelane_b32 v31, s30, 0
+; GFX8-NEXT: v_writelane_b32 v31, s31, 1
; GFX8-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18
; GFX8-NEXT: v_max_f32_e32 v2, v2, v18
-; GFX8-NEXT: v_mov_b32_e32 v17, 0x7fc00000
-; GFX8-NEXT: v_max_f32_e32 v18, v13, v29
-; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29
-; GFX8-NEXT: v_writelane_b32 v31, s31, 1
; GFX8-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19
; GFX8-NEXT: v_max_f32_e32 v3, v3, v19
+; GFX8-NEXT: v_mov_b32_e32 v18, 0x7fc00000
+; GFX8-NEXT: v_max_f32_e32 v19, v0, v16
+; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16
+; GFX8-NEXT: v_max_f32_e32 v16, v14, v30
+; GFX8-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30
; GFX8-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20
; GFX8-NEXT: v_max_f32_e32 v4, v4, v20
; GFX8-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21
@@ -1819,39 +1819,39 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
; GFX8-NEXT: v_max_f32_e32 v6, v6, v22
; GFX8-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23
; GFX8-NEXT: v_max_f32_e32 v7, v7, v23
-; GFX8-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24
+; GFX8-NEXT: v_cmp_o_f32_e64 s[16:17], v8, v24
; GFX8-NEXT: v_max_f32_e32 v8, v8, v24
-; GFX8-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25
+; GFX8-NEXT: v_cmp_o_f32_e64 s[18:19], v9, v25
; GFX8-NEXT: v_max_f32_e32 v9, v9, v25
-; GFX8-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26
+; GFX8-NEXT: v_cmp_o_f32_e64 s[20:21], v10, v26
; GFX8-NEXT: v_max_f32_e32 v10, v10, v26
-; GFX8-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27
+; GFX8-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27
; GFX8-NEXT: v_max_f32_e32 v11, v11, v27
-; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28
+; GFX8-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28
; GFX8-NEXT: v_max_f32_e32 v12, v12, v28
-; GFX8-NEXT: v_max_f32_e32 v19, v14, v30
-; GFX8-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17]
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7]
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9]
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11]
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13]
-; GFX8-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15]
-; GFX8-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19]
-; GFX8-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21]
-; GFX8-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23]
-; GFX8-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25]
-; GFX8-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27]
-; GFX8-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31]
+; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29
+; GFX8-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9]
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11]
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13]
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15]
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[16:17]
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[18:19]
+; GFX8-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[20:21]
+; GFX8-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23]
+; GFX8-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25]
+; GFX8-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27]
; GFX8-NEXT: v_readlane_b32 s31, v31, 1
; GFX8-NEXT: v_readlane_b32 s30, v31, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f32_e32 v18, v15, v16
-; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v16
-; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX8-NEXT: v_max_f32_e32 v16, v15, v17
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
@@ -1864,20 +1864,20 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
-; GFX900-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16
-; GFX900-NEXT: v_max_f32_e32 v0, v0, v16
-; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32
-; GFX900-NEXT: v_writelane_b32 v31, s30, 0
; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v17
; GFX900-NEXT: v_max_f32_e32 v1, v1, v17
+; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32
+; GFX900-NEXT: v_writelane_b32 v31, s30, 0
+; GFX900-NEXT: v_writelane_b32 v31, s31, 1
; GFX900-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18
; GFX900-NEXT: v_max_f32_e32 v2, v2, v18
-; GFX900-NEXT: v_mov_b32_e32 v17, 0x7fc00000
-; GFX900-NEXT: v_max_f32_e32 v18, v13, v29
-; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29
-; GFX900-NEXT: v_writelane_b32 v31, s31, 1
; GFX900-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19
; GFX900-NEXT: v_max_f32_e32 v3, v3, v19
+; GFX900-NEXT: v_mov_b32_e32 v18, 0x7fc00000
+; GFX900-NEXT: v_max_f32_e32 v19, v0, v16
+; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16
+; GFX900-NEXT: v_max_f32_e32 v16, v14, v30
+; GFX900-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30
; GFX900-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20
; GFX900-NEXT: v_max_f32_e32 v4, v4, v20
; GFX900-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21
@@ -1886,39 +1886,39 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
; GFX900-NEXT: v_max_f32_e32 v6, v6, v22
; GFX900-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23
; GFX900-NEXT: v_max_f32_e32 v7, v7, v23
-; GFX900-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24
+; GFX900-NEXT: v_cmp_o_f32_e64 s[16:17], v8, v24
; GFX900-NEXT: v_max_f32_e32 v8, v8, v24
-; GFX900-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25
+; GFX900-NEXT: v_cmp_o_f32_e64 s[18:19], v9, v25
; GFX900-NEXT: v_max_f32_e32 v9, v9, v25
-; GFX900-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26
+; GFX900-NEXT: v_cmp_o_f32_e64 s[20:21], v10, v26
; GFX900-NEXT: v_max_f32_e32 v10, v10, v26
-; GFX900-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27
+; GFX900-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27
; GFX900-NEXT: v_max_f32_e32 v11, v11, v27
-; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28
+; GFX900-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28
; GFX900-NEXT: v_max_f32_e32 v12, v12, v28
-; GFX900-NEXT: v_max_f32_e32 v19, v14, v30
-; GFX900-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30
-; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
-; GFX900-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29]
-; GFX900-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17]
-; GFX900-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5]
-; GFX900-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7]
-; GFX900-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9]
-; GFX900-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11]
-; GFX900-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13]
-; GFX900-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15]
-; GFX900-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19]
-; GFX900-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21]
-; GFX900-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23]
-; GFX900-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25]
-; GFX900-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27]
-; GFX900-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31]
+; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29
+; GFX900-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc
+; GFX900-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31]
+; GFX900-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29]
+; GFX900-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5]
+; GFX900-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7]
+; GFX900-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9]
+; GFX900-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11]
+; GFX900-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13]
+; GFX900-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15]
+; GFX900-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[16:17]
+; GFX900-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[18:19]
+; GFX900-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[20:21]
+; GFX900-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23]
+; GFX900-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25]
+; GFX900-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27]
; GFX900-NEXT: v_readlane_b32 s31, v31, 1
; GFX900-NEXT: v_readlane_b32 s30, v31, 0
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: v_max_f32_e32 v18, v15, v16
-; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v16
-; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX900-NEXT: v_max_f32_e32 v16, v15, v17
+; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v17
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index f8c2c54af27830..0b9cb9682ea5f9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -1598,87 +1598,87 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) {
; GFX8-LABEL: v_minimum_v16f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v15
-; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v7
-; GFX8-NEXT: v_min_f16_e32 v18, v17, v16
-; GFX8-NEXT: v_mov_b32_e32 v19, 0x7e00
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v17, v16
-; GFX8-NEXT: v_cndmask_b32_e32 v16, v19, v18, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v14
; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v6
-; GFX8-NEXT: v_min_f16_e32 v20, v18, v17
+; GFX8-NEXT: v_min_f16_e32 v16, v18, v17
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v18, v17
-; GFX8-NEXT: v_cndmask_b32_e32 v17, v19, v20, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v13
-; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v5
-; GFX8-NEXT: v_min_f16_e32 v21, v20, v18
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v20, v18
-; GFX8-NEXT: v_cndmask_b32_e32 v18, v19, v21, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v12
-; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v4
-; GFX8-NEXT: v_min_f16_e32 v22, v21, v20
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v21, v20
-; GFX8-NEXT: v_cndmask_b32_e32 v20, v19, v22, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v11
-; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v3
-; GFX8-NEXT: v_min_f16_e32 v23, v22, v21
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v22, v21
-; GFX8-NEXT: v_cndmask_b32_e32 v21, v19, v23, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v10
-; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v2
-; GFX8-NEXT: v_min_f16_e32 v24, v23, v22
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v23, v22
-; GFX8-NEXT: v_cndmask_b32_e32 v22, v19, v24, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v9
-; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v1
-; GFX8-NEXT: v_min_f16_e32 v25, v24, v23
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v24, v23
-; GFX8-NEXT: v_cndmask_b32_e32 v23, v19, v25, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v0
-; GFX8-NEXT: v_min_f16_e32 v26, v25, v24
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v25, v24
-; GFX8-NEXT: v_cndmask_b32_e32 v24, v19, v26, vcc
-; GFX8-NEXT: v_min_f16_e32 v25, v7, v15
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v7, v15
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v19, v25, vcc
-; GFX8-NEXT: v_min_f16_e32 v15, v6, v14
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v6, v14
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v19, v15, vcc
-; GFX8-NEXT: v_min_f16_e32 v14, v5, v13
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v13
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v19, v14, vcc
-; GFX8-NEXT: v_min_f16_e32 v13, v4, v12
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v4, v12
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v19, v13, vcc
-; GFX8-NEXT: v_min_f16_e32 v12, v3, v11
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v11
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v12, vcc
-; GFX8-NEXT: v_min_f16_e32 v11, v2, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v5
+; GFX8-NEXT: v_min_f16_e32 v20, v18, v17
+; GFX8-NEXT: v_cmp_o_f16_e64 s[4:5], v18, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v4
+; GFX8-NEXT: v_min_f16_e32 v21, v18, v17
+; GFX8-NEXT: v_cmp_o_f16_e64 s[6:7], v18, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v3
+; GFX8-NEXT: v_min_f16_e32 v22, v18, v17
+; GFX8-NEXT: v_cmp_o_f16_e64 s[8:9], v18, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v2
+; GFX8-NEXT: v_min_f16_e32 v23, v18, v17
+; GFX8-NEXT: v_cmp_o_f16_e64 s[10:11], v18, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v1
+; GFX8-NEXT: v_min_f16_e32 v24, v18, v17
+; GFX8-NEXT: v_cmp_o_f16_e64 s[12:13], v18, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v0
+; GFX8-NEXT: v_min_f16_e32 v25, v18, v17
+; GFX8-NEXT: v_cmp_o_f16_e64 s[14:15], v18, v17
+; GFX8-NEXT: v_min_f16_e32 v17, v6, v14
+; GFX8-NEXT: v_cmp_o_f16_e64 s[16:17], v6, v14
+; GFX8-NEXT: v_min_f16_e32 v6, v5, v13
+; GFX8-NEXT: v_cmp_o_f16_e64 s[18:19], v5, v13
+; GFX8-NEXT: v_min_f16_e32 v5, v4, v12
+; GFX8-NEXT: v_cmp_o_f16_e64 s[20:21], v4, v12
+; GFX8-NEXT: v_min_f16_e32 v4, v3, v11
+; GFX8-NEXT: v_cmp_o_f16_e64 s[22:23], v3, v11
+; GFX8-NEXT: v_min_f16_e32 v11, v7, v15
+; GFX8-NEXT: v_cmp_o_f16_e64 s[24:25], v7, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_mov_b32_e32 v19, 0x7e00
+; GFX8-NEXT: v_min_f16_e32 v13, v7, v12
+; GFX8-NEXT: v_cmp_o_f16_e64 s[26:27], v7, v12
+; GFX8-NEXT: v_min_f16_e32 v3, v2, v10
+; GFX8-NEXT: v_cndmask_b32_e64 v12, v19, v13, s[26:27]
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v19, v16, vcc
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v2, v10
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v11, vcc
-; GFX8-NEXT: v_min_f16_e32 v10, v1, v9
+; GFX8-NEXT: v_min_f16_e32 v14, v1, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v9
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v10, vcc
-; GFX8-NEXT: v_min_f16_e32 v9, v0, v8
+; GFX8-NEXT: v_min_f16_e32 v7, v0, v8
+; GFX8-NEXT: v_cndmask_b32_e64 v18, v19, v22, s[8:9]
+; GFX8-NEXT: v_cndmask_b32_e64 v22, v19, v25, s[14:15]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v14, vcc
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v8
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v19, v9, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v24
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v23
-; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v22
-; GFX8-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v21
-; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v20
-; GFX8-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v18
-; GFX8-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v17
-; GFX8-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v16
-; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_cndmask_b32_e64 v16, v19, v21, s[6:7]
+; GFX8-NEXT: v_cndmask_b32_e64 v21, v19, v24, s[12:13]
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v19, v7, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v22
+; GFX8-NEXT: v_cndmask_b32_e64 v15, v19, v20, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v20, v19, v23, s[10:11]
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v21
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v20
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v19, v4, s[22:23]
+; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v18
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v19, v5, s[20:21]
+; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v16
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v19, v6, s[18:19]
+; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v15
+; GFX8-NEXT: v_cndmask_b32_e64 v11, v19, v11, s[24:25]
+; GFX8-NEXT: v_cndmask_b32_e64 v17, v19, v17, s[16:17]
+; GFX8-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v12
+; GFX8-NEXT: v_or_b32_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v7, v11, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_minimum_v16f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
index 956de6de3aad3b..99624331340730 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
@@ -1730,20 +1730,20 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16
-; GFX7-NEXT: v_min_f32_e32 v0, v0, v16
-; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32
-; GFX7-NEXT: v_writelane_b32 v31, s30, 0
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v17
; GFX7-NEXT: v_min_f32_e32 v1, v1, v17
+; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32
+; GFX7-NEXT: v_writelane_b32 v31, s30, 0
+; GFX7-NEXT: v_writelane_b32 v31, s31, 1
; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18
; GFX7-NEXT: v_min_f32_e32 v2, v2, v18
-; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000
-; GFX7-NEXT: v_min_f32_e32 v18, v13, v29
-; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29
-; GFX7-NEXT: v_writelane_b32 v31, s31, 1
; GFX7-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19
; GFX7-NEXT: v_min_f32_e32 v3, v3, v19
+; GFX7-NEXT: v_mov_b32_e32 v18, 0x7fc00000
+; GFX7-NEXT: v_min_f32_e32 v19, v0, v16
+; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16
+; GFX7-NEXT: v_min_f32_e32 v16, v14, v30
+; GFX7-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30
; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20
; GFX7-NEXT: v_min_f32_e32 v4, v4, v20
; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21
@@ -1752,39 +1752,39 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
; GFX7-NEXT: v_min_f32_e32 v6, v6, v22
; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23
; GFX7-NEXT: v_min_f32_e32 v7, v7, v23
-; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24
+; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v8, v24
; GFX7-NEXT: v_min_f32_e32 v8, v8, v24
-; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25
+; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v9, v25
; GFX7-NEXT: v_min_f32_e32 v9, v9, v25
-; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26
+; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v10, v26
; GFX7-NEXT: v_min_f32_e32 v10, v10, v26
-; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27
+; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27
; GFX7-NEXT: v_min_f32_e32 v11, v11, v27
-; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28
+; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28
; GFX7-NEXT: v_min_f32_e32 v12, v12, v28
-; GFX7-NEXT: v_min_f32_e32 v19, v14, v30
-; GFX7-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
-; GFX7-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29]
-; GFX7-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17]
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5]
-; GFX7-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7]
-; GFX7-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9]
-; GFX7-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11]
-; GFX7-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13]
-; GFX7-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15]
-; GFX7-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19]
-; GFX7-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21]
-; GFX7-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23]
-; GFX7-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25]
-; GFX7-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27]
-; GFX7-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31]
+; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29
+; GFX7-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc
+; GFX7-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31]
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29]
+; GFX7-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7]
+; GFX7-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9]
+; GFX7-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11]
+; GFX7-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13]
+; GFX7-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15]
+; GFX7-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[16:17]
+; GFX7-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[18:19]
+; GFX7-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[20:21]
+; GFX7-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23]
+; GFX7-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25]
+; GFX7-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27]
; GFX7-NEXT: v_readlane_b32 s31, v31, 1
; GFX7-NEXT: v_readlane_b32 s30, v31, 0
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_min_f32_e32 v18, v15, v16
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v16
-; GFX7-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX7-NEXT: v_min_f32_e32 v16, v15, v17
+; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v17
+; GFX7-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
@@ -1797,20 +1797,20 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16
-; GFX8-NEXT: v_min_f32_e32 v0, v0, v16
-; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32
-; GFX8-NEXT: v_writelane_b32 v31, s30, 0
; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v17
; GFX8-NEXT: v_min_f32_e32 v1, v1, v17
+; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32
+; GFX8-NEXT: v_writelane_b32 v31, s30, 0
+; GFX8-NEXT: v_writelane_b32 v31, s31, 1
; GFX8-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18
; GFX8-NEXT: v_min_f32_e32 v2, v2, v18
-; GFX8-NEXT: v_mov_b32_e32 v17, 0x7fc00000
-; GFX8-NEXT: v_min_f32_e32 v18, v13, v29
-; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29
-; GFX8-NEXT: v_writelane_b32 v31, s31, 1
; GFX8-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19
; GFX8-NEXT: v_min_f32_e32 v3, v3, v19
+; GFX8-NEXT: v_mov_b32_e32 v18, 0x7fc00000
+; GFX8-NEXT: v_min_f32_e32 v19, v0, v16
+; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16
+; GFX8-NEXT: v_min_f32_e32 v16, v14, v30
+; GFX8-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30
; GFX8-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20
; GFX8-NEXT: v_min_f32_e32 v4, v4, v20
; GFX8-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21
@@ -1819,39 +1819,39 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
; GFX8-NEXT: v_min_f32_e32 v6, v6, v22
; GFX8-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23
; GFX8-NEXT: v_min_f32_e32 v7, v7, v23
-; GFX8-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24
+; GFX8-NEXT: v_cmp_o_f32_e64 s[16:17], v8, v24
; GFX8-NEXT: v_min_f32_e32 v8, v8, v24
-; GFX8-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25
+; GFX8-NEXT: v_cmp_o_f32_e64 s[18:19], v9, v25
; GFX8-NEXT: v_min_f32_e32 v9, v9, v25
-; GFX8-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26
+; GFX8-NEXT: v_cmp_o_f32_e64 s[20:21], v10, v26
; GFX8-NEXT: v_min_f32_e32 v10, v10, v26
-; GFX8-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27
+; GFX8-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27
; GFX8-NEXT: v_min_f32_e32 v11, v11, v27
-; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28
+; GFX8-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28
; GFX8-NEXT: v_min_f32_e32 v12, v12, v28
-; GFX8-NEXT: v_min_f32_e32 v19, v14, v30
-; GFX8-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17]
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7]
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9]
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11]
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13]
-; GFX8-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15]
-; GFX8-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19]
-; GFX8-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21]
-; GFX8-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23]
-; GFX8-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25]
-; GFX8-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27]
-; GFX8-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31]
+; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29
+; GFX8-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9]
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11]
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13]
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15]
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[16:17]
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[18:19]
+; GFX8-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[20:21]
+; GFX8-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23]
+; GFX8-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25]
+; GFX8-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27]
; GFX8-NEXT: v_readlane_b32 s31, v31, 1
; GFX8-NEXT: v_readlane_b32 s30, v31, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_min_f32_e32 v18, v15, v16
-; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v16
-; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX8-NEXT: v_min_f32_e32 v16, v15, v17
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
@@ -1864,20 +1864,20 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
-; GFX900-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16
-; GFX900-NEXT: v_min_f32_e32 v0, v0, v16
-; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32
-; GFX900-NEXT: v_writelane_b32 v31, s30, 0
; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v17
; GFX900-NEXT: v_min_f32_e32 v1, v1, v17
+; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32
+; GFX900-NEXT: v_writelane_b32 v31, s30, 0
+; GFX900-NEXT: v_writelane_b32 v31, s31, 1
; GFX900-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18
; GFX900-NEXT: v_min_f32_e32 v2, v2, v18
-; GFX900-NEXT: v_mov_b32_e32 v17, 0x7fc00000
-; GFX900-NEXT: v_min_f32_e32 v18, v13, v29
-; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29
-; GFX900-NEXT: v_writelane_b32 v31, s31, 1
; GFX900-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19
; GFX900-NEXT: v_min_f32_e32 v3, v3, v19
+; GFX900-NEXT: v_mov_b32_e32 v18, 0x7fc00000
+; GFX900-NEXT: v_min_f32_e32 v19, v0, v16
+; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16
+; GFX900-NEXT: v_min_f32_e32 v16, v14, v30
+; GFX900-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30
; GFX900-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20
; GFX900-NEXT: v_min_f32_e32 v4, v4, v20
; GFX900-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21
@@ -1886,39 +1886,39 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
; GFX900-NEXT: v_min_f32_e32 v6, v6, v22
; GFX900-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23
; GFX900-NEXT: v_min_f32_e32 v7, v7, v23
-; GFX900-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24
+; GFX900-NEXT: v_cmp_o_f32_e64 s[16:17], v8, v24
; GFX900-NEXT: v_min_f32_e32 v8, v8, v24
-; GFX900-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25
+; GFX900-NEXT: v_cmp_o_f32_e64 s[18:19], v9, v25
; GFX900-NEXT: v_min_f32_e32 v9, v9, v25
-; GFX900-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26
+; GFX900-NEXT: v_cmp_o_f32_e64 s[20:21], v10, v26
; GFX900-NEXT: v_min_f32_e32 v10, v10, v26
-; GFX900-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27
+; GFX900-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27
; GFX900-NEXT: v_min_f32_e32 v11, v11, v27
-; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28
+; GFX900-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28
; GFX900-NEXT: v_min_f32_e32 v12, v12, v28
-; GFX900-NEXT: v_min_f32_e32 v19, v14, v30
-; GFX900-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30
-; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
-; GFX900-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29]
-; GFX900-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17]
-; GFX900-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5]
-; GFX900-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7]
-; GFX900-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9]
-; GFX900-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11]
-; GFX900-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13]
-; GFX900-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15]
-; GFX900-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19]
-; GFX900-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21]
-; GFX900-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23]
-; GFX900-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25]
-; GFX900-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27]
-; GFX900-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31]
+; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29
+; GFX900-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc
+; GFX900-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31]
+; GFX900-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29]
+; GFX900-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5]
+; GFX900-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7]
+; GFX900-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9]
+; GFX900-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11]
+; GFX900-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13]
+; GFX900-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15]
+; GFX900-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[16:17]
+; GFX900-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[18:19]
+; GFX900-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[20:21]
+; GFX900-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23]
+; GFX900-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25]
+; GFX900-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27]
; GFX900-NEXT: v_readlane_b32 s31, v31, 1
; GFX900-NEXT: v_readlane_b32 s30, v31, 0
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: v_min_f32_e32 v18, v15, v16
-; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v16
-; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX900-NEXT: v_min_f32_e32 v16, v15, v17
+; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v17
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
index c735854a455905..b378d69fb842ff 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -574,84 +574,85 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in)
; CI-LABEL: round_v8f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19
-; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; CI-NEXT: s_brev_b32 s2, -2
+; CI-NEXT: s_brev_b32 s6, -2
; CI-NEXT: v_mov_b32_e32 v4, 0
-; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_trunc_f64_e32 v[0:1], s[10:11]
; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9]
; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1]
+; CI-NEXT: v_add_f64 v[8:9], s[8:9], -v[6:7]
+; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5
+; CI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[8:9]|, 0.5
+; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; CI-NEXT: s_cselect_b32 s7, 0x3ff00000, 0
; CI-NEXT: v_mov_b32_e32 v5, s11
-; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[2:3]|, 0.5
-; CI-NEXT: v_add_f64 v[2:3], s[8:9], -v[6:7]
-; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0
-; CI-NEXT: v_mov_b32_e32 v8, s4
-; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[2:3]|, 0.5
-; CI-NEXT: v_bfi_b32 v5, s2, v8, v5
-; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; CI-NEXT: s_and_b64 s[0:1], s[2:3], exec
+; CI-NEXT: v_mov_b32_e32 v2, s7
; CI-NEXT: v_trunc_f64_e32 v[8:9], s[14:15]
-; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0
+; CI-NEXT: v_bfi_b32 v5, s6, v2, v5
+; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5]
-; CI-NEXT: v_mov_b32_e32 v5, s4
+; CI-NEXT: v_mov_b32_e32 v5, s0
; CI-NEXT: v_mov_b32_e32 v10, s9
; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[8:9]
-; CI-NEXT: v_bfi_b32 v5, s2, v5, v10
-; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5
+; CI-NEXT: v_bfi_b32 v5, s6, v5, v10
+; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[0:1]|, 0.5
; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5]
; CI-NEXT: v_trunc_f64_e32 v[6:7], s[12:13]
-; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CI-NEXT: v_add_f64 v[10:11], s[12:13], -v[6:7]
-; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0
-; CI-NEXT: v_mov_b32_e32 v5, s4
-; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5
+; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
+; CI-NEXT: v_mov_b32_e32 v5, s0
+; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5
; CI-NEXT: v_trunc_f64_e32 v[10:11], s[18:19]
; CI-NEXT: v_mov_b32_e32 v12, s15
-; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; CI-NEXT: v_bfi_b32 v5, s2, v5, v12
-; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0
+; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; CI-NEXT: v_bfi_b32 v5, s6, v5, v12
+; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
; CI-NEXT: v_add_f64 v[12:13], s[18:19], -v[10:11]
; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[4:5]
-; CI-NEXT: v_mov_b32_e32 v5, s4
+; CI-NEXT: v_mov_b32_e32 v5, s0
; CI-NEXT: v_mov_b32_e32 v14, s13
-; CI-NEXT: v_bfi_b32 v5, s2, v5, v14
-; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[12:13]|, 0.5
+; CI-NEXT: v_bfi_b32 v5, s6, v5, v14
+; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[12:13]|, 0.5
; CI-NEXT: v_trunc_f64_e32 v[14:15], s[16:17]
-; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CI-NEXT: v_add_f64 v[12:13], s[16:17], -v[14:15]
-; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0
+; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5]
-; CI-NEXT: v_mov_b32_e32 v5, s4
-; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[12:13]|, 0.5
+; CI-NEXT: v_mov_b32_e32 v5, s0
+; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[12:13]|, 0.5
; CI-NEXT: v_mov_b32_e32 v16, s19
-; CI-NEXT: v_bfi_b32 v5, s2, v5, v16
-; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; CI-NEXT: v_bfi_b32 v5, s6, v5, v16
+; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
; CI-NEXT: v_trunc_f64_e32 v[16:17], s[22:23]
-; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0
; CI-NEXT: v_add_f64 v[12:13], v[10:11], v[4:5]
-; CI-NEXT: v_mov_b32_e32 v5, s4
-; CI-NEXT: v_mov_b32_e32 v18, s17
-; CI-NEXT: v_add_f64 v[10:11], s[22:23], -v[16:17]
-; CI-NEXT: v_bfi_b32 v5, s2, v5, v18
-; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5
+; CI-NEXT: v_mov_b32_e32 v5, s0
+; CI-NEXT: v_mov_b32_e32 v10, s17
+; CI-NEXT: v_bfi_b32 v5, s6, v5, v10
+; CI-NEXT: v_add_f64 v[18:19], s[22:23], -v[16:17]
; CI-NEXT: v_add_f64 v[10:11], v[14:15], v[4:5]
; CI-NEXT: v_trunc_f64_e32 v[14:15], s[20:21]
-; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[18:19]|, 0.5
; CI-NEXT: v_add_f64 v[18:19], s[20:21], -v[14:15]
-; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0
-; CI-NEXT: v_mov_b32_e32 v5, s4
-; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[18:19]|, 0.5
-; CI-NEXT: v_mov_b32_e32 v20, s23
-; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; CI-NEXT: v_bfi_b32 v5, s2, v5, v20
-; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0
+; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[18:19]|, 0.5
+; CI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
+; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; CI-NEXT: v_mov_b32_e32 v5, s2
+; CI-NEXT: v_mov_b32_e32 v18, s23
+; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
+; CI-NEXT: v_bfi_b32 v5, s6, v5, v18
+; CI-NEXT: v_mov_b32_e32 v18, s0
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; CI-NEXT: v_mov_b32_e32 v19, s21
; CI-NEXT: v_add_f64 v[16:17], v[16:17], v[4:5]
-; CI-NEXT: v_mov_b32_e32 v5, s4
-; CI-NEXT: v_mov_b32_e32 v18, s21
-; CI-NEXT: v_bfi_b32 v5, s2, v5, v18
+; CI-NEXT: v_bfi_b32 v5, s6, v18, v19
; CI-NEXT: v_add_f64 v[14:15], v[14:15], v[4:5]
+; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48
; CI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32
; CI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index c1ab63b8160c6a..223870950e4b78 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -1772,42 +1772,42 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: flat_load_ushort v12, v[0:1]
+; GFX8-NEXT: flat_load_ushort v18, v[0:1]
; GFX8-NEXT: s_add_u32 s2, s0, 48
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v19, s3
-; GFX8-NEXT: v_mov_b32_e32 v18, s2
+; GFX8-NEXT: v_mov_b32_e32 v9, s3
+; GFX8-NEXT: v_mov_b32_e32 v8, s2
; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_mov_b32_e32 v17, s1
+; GFX8-NEXT: v_mov_b32_e32 v13, s1
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v16, s0
+; GFX8-NEXT: v_mov_b32_e32 v12, s0
; GFX8-NEXT: s_add_u32 s0, s0, 16
+; GFX8-NEXT: v_mov_b32_e32 v15, s3
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
-; GFX8-NEXT: v_mov_b32_e32 v23, s1
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
-; GFX8-NEXT: v_mov_b32_e32 v22, s0
+; GFX8-NEXT: v_mov_b32_e32 v14, s2
+; GFX8-NEXT: v_mov_b32_e32 v17, s1
+; GFX8-NEXT: v_mov_b32_e32 v16, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_bfe_i32 v3, v12, 3, 1
-; GFX8-NEXT: v_bfe_i32 v2, v12, 2, 1
-; GFX8-NEXT: v_bfe_i32 v1, v12, 1, 1
-; GFX8-NEXT: v_bfe_i32 v0, v12, 0, 1
-; GFX8-NEXT: v_bfe_i32 v7, v12, 7, 1
-; GFX8-NEXT: v_bfe_i32 v6, v12, 6, 1
-; GFX8-NEXT: v_bfe_i32 v5, v12, 5, 1
-; GFX8-NEXT: v_bfe_i32 v4, v12, 4, 1
-; GFX8-NEXT: v_bfe_i32 v11, v12, 11, 1
-; GFX8-NEXT: v_bfe_i32 v10, v12, 10, 1
-; GFX8-NEXT: v_bfe_i32 v9, v12, 9, 1
-; GFX8-NEXT: v_bfe_i32 v8, v12, 8, 1
-; GFX8-NEXT: v_bfe_i32 v15, v12, 15, 1
-; GFX8-NEXT: v_bfe_i32 v14, v12, 14, 1
-; GFX8-NEXT: v_bfe_i32 v13, v12, 13, 1
-; GFX8-NEXT: v_bfe_i32 v12, v12, 12, 1
-; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
-; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[8:11]
-; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[4:7]
-; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
+; GFX8-NEXT: v_bfe_i32 v7, v18, 15, 1
+; GFX8-NEXT: v_bfe_i32 v6, v18, 14, 1
+; GFX8-NEXT: v_bfe_i32 v5, v18, 13, 1
+; GFX8-NEXT: v_bfe_i32 v4, v18, 12, 1
+; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; GFX8-NEXT: v_bfe_i32 v11, v18, 11, 1
+; GFX8-NEXT: v_bfe_i32 v10, v18, 10, 1
+; GFX8-NEXT: v_bfe_i32 v9, v18, 9, 1
+; GFX8-NEXT: v_bfe_i32 v8, v18, 8, 1
+; GFX8-NEXT: v_bfe_i32 v3, v18, 3, 1
+; GFX8-NEXT: v_bfe_i32 v2, v18, 2, 1
+; GFX8-NEXT: v_bfe_i32 v1, v18, 1, 1
+; GFX8-NEXT: v_bfe_i32 v0, v18, 0, 1
+; GFX8-NEXT: v_bfe_i32 v7, v18, 7, 1
+; GFX8-NEXT: v_bfe_i32 v6, v18, 6, 1
+; GFX8-NEXT: v_bfe_i32 v5, v18, 5, 1
+; GFX8-NEXT: v_bfe_i32 v4, v18, 4, 1
+; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
+; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[4:7]
+; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: constant_sextload_v16i1_to_v16i32:
@@ -2707,33 +2707,33 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX6-NEXT: s_bfe_u32 s8, s2, 0x1000b
; GFX6-NEXT: s_bfe_u32 s9, s2, 0x10009
; GFX6-NEXT: s_bfe_u32 s10, s2, 0x1000f
-; GFX6-NEXT: s_bfe_u32 s11, s2, 0x1000d
-; GFX6-NEXT: s_bfe_u32 s12, s2, 0x10013
-; GFX6-NEXT: s_bfe_u32 s13, s2, 0x10011
-; GFX6-NEXT: s_bfe_u32 s14, s2, 0x10017
-; GFX6-NEXT: s_bfe_u32 s15, s2, 0x10015
-; GFX6-NEXT: s_bfe_u32 s16, s2, 0x1001b
-; GFX6-NEXT: s_bfe_u32 s17, s2, 0x10019
-; GFX6-NEXT: s_lshr_b32 s18, s2, 31
-; GFX6-NEXT: s_bfe_u32 s19, s2, 0x1001d
-; GFX6-NEXT: s_bfe_u32 s20, s3, 0x10003
-; GFX6-NEXT: s_bfe_u32 s21, s3, 0x10001
-; GFX6-NEXT: s_bfe_u32 s22, s3, 0x10007
-; GFX6-NEXT: s_bfe_u32 s23, s3, 0x10005
-; GFX6-NEXT: s_bfe_u32 s24, s3, 0x1000b
-; GFX6-NEXT: s_bfe_u32 s25, s3, 0x10009
-; GFX6-NEXT: s_bfe_u32 s26, s3, 0x1000f
-; GFX6-NEXT: s_bfe_u32 s27, s3, 0x1000d
-; GFX6-NEXT: s_bfe_u32 s28, s3, 0x10013
-; GFX6-NEXT: s_bfe_u32 s29, s3, 0x10011
-; GFX6-NEXT: s_bfe_u32 s30, s3, 0x10017
-; GFX6-NEXT: s_bfe_u32 s31, s3, 0x10015
-; GFX6-NEXT: s_bfe_u32 s33, s3, 0x1001b
-; GFX6-NEXT: s_bfe_u32 s34, s3, 0x10019
-; GFX6-NEXT: s_lshr_b32 s35, s3, 31
-; GFX6-NEXT: s_bfe_u32 s36, s3, 0x1001d
-; GFX6-NEXT: s_and_b32 s37, s2, 1
-; GFX6-NEXT: s_bfe_u32 s38, s2, 0x10002
+; GFX6-NEXT: s_bfe_u32 s13, s2, 0x1000d
+; GFX6-NEXT: s_bfe_u32 s14, s2, 0x10013
+; GFX6-NEXT: s_bfe_u32 s15, s2, 0x10011
+; GFX6-NEXT: s_bfe_u32 s16, s2, 0x10017
+; GFX6-NEXT: s_bfe_u32 s17, s2, 0x10015
+; GFX6-NEXT: s_bfe_u32 s18, s2, 0x1001b
+; GFX6-NEXT: s_bfe_u32 s19, s2, 0x10019
+; GFX6-NEXT: s_lshr_b32 s20, s2, 31
+; GFX6-NEXT: s_bfe_u32 s21, s2, 0x1001d
+; GFX6-NEXT: s_bfe_u32 s22, s3, 0x10003
+; GFX6-NEXT: s_bfe_u32 s23, s3, 0x10001
+; GFX6-NEXT: s_bfe_u32 s24, s3, 0x10007
+; GFX6-NEXT: s_bfe_u32 s25, s3, 0x10005
+; GFX6-NEXT: s_bfe_u32 s26, s3, 0x1000b
+; GFX6-NEXT: s_bfe_u32 s27, s3, 0x10009
+; GFX6-NEXT: s_bfe_u32 s28, s3, 0x1000f
+; GFX6-NEXT: s_bfe_u32 s29, s3, 0x1000d
+; GFX6-NEXT: s_bfe_u32 s30, s3, 0x10013
+; GFX6-NEXT: s_bfe_u32 s31, s3, 0x10011
+; GFX6-NEXT: s_bfe_u32 s33, s3, 0x10017
+; GFX6-NEXT: s_bfe_u32 s34, s3, 0x10015
+; GFX6-NEXT: s_bfe_u32 s35, s3, 0x1001b
+; GFX6-NEXT: s_bfe_u32 s36, s3, 0x10019
+; GFX6-NEXT: s_lshr_b32 s37, s3, 31
+; GFX6-NEXT: s_bfe_u32 s38, s3, 0x1001d
+; GFX6-NEXT: s_and_b32 s12, s2, 1
+; GFX6-NEXT: s_bfe_u32 s11, s2, 0x10002
; GFX6-NEXT: s_bfe_u32 s39, s2, 0x10006
; GFX6-NEXT: s_bfe_u32 s40, s2, 0x10004
; GFX6-NEXT: s_bfe_u32 s41, s2, 0x1000a
@@ -2752,91 +2752,90 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX6-NEXT: s_bfe_u32 s54, s3, 0x10002
; GFX6-NEXT: s_bfe_u32 s55, s3, 0x10006
; GFX6-NEXT: s_bfe_u32 s56, s3, 0x10004
-; GFX6-NEXT: s_bfe_u32 s57, s3, 0x1000a
-; GFX6-NEXT: s_bfe_u32 s58, s3, 0x10008
-; GFX6-NEXT: s_bfe_u32 s59, s3, 0x1000e
+; GFX6-NEXT: s_bfe_u32 s57, s3, 0x10008
+; GFX6-NEXT: s_bfe_u32 s58, s3, 0x1000e
+; GFX6-NEXT: s_bfe_u32 s59, s3, 0x1000c
; GFX6-NEXT: s_bfe_u32 s60, s3, 0x10012
; GFX6-NEXT: s_bfe_u32 s61, s3, 0x10010
; GFX6-NEXT: s_bfe_u32 s62, s3, 0x10016
-; GFX6-NEXT: s_bfe_u32 s63, s3, 0x1001a
-; GFX6-NEXT: s_bfe_u32 s64, s3, 0x10018
-; GFX6-NEXT: s_bfe_u32 s65, s3, 0x1001e
-; GFX6-NEXT: s_bfe_u32 s66, s3, 0x1001c
-; GFX6-NEXT: s_bfe_u32 s67, s3, 0x10014
-; GFX6-NEXT: s_bfe_u32 s68, s3, 0x1000c
+; GFX6-NEXT: s_bfe_u32 s63, s3, 0x10014
+; GFX6-NEXT: s_bfe_u32 s64, s3, 0x1001a
+; GFX6-NEXT: s_bfe_u32 s65, s3, 0x10018
+; GFX6-NEXT: s_bfe_u32 s66, s3, 0x1001e
+; GFX6-NEXT: s_bfe_u32 s67, s3, 0x1001c
+; GFX6-NEXT: s_bfe_u32 s68, s3, 0x1000a
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_mov_b32_e32 v0, s66
-; GFX6-NEXT: v_mov_b32_e32 v1, s36
-; GFX6-NEXT: v_mov_b32_e32 v2, s65
-; GFX6-NEXT: v_mov_b32_e32 v3, s35
-; GFX6-NEXT: v_mov_b32_e32 v4, s64
-; GFX6-NEXT: v_mov_b32_e32 v5, s34
-; GFX6-NEXT: v_mov_b32_e32 v6, s63
-; GFX6-NEXT: v_mov_b32_e32 v7, s33
-; GFX6-NEXT: v_mov_b32_e32 v8, s67
-; GFX6-NEXT: v_mov_b32_e32 v9, s31
+; GFX6-NEXT: v_mov_b32_e32 v0, s67
+; GFX6-NEXT: v_mov_b32_e32 v1, s38
+; GFX6-NEXT: v_mov_b32_e32 v2, s66
+; GFX6-NEXT: v_mov_b32_e32 v3, s37
+; GFX6-NEXT: v_mov_b32_e32 v4, s65
+; GFX6-NEXT: v_mov_b32_e32 v5, s36
+; GFX6-NEXT: v_mov_b32_e32 v6, s64
+; GFX6-NEXT: v_mov_b32_e32 v7, s35
+; GFX6-NEXT: v_mov_b32_e32 v8, s63
+; GFX6-NEXT: v_mov_b32_e32 v9, s34
; GFX6-NEXT: v_mov_b32_e32 v10, s62
-; GFX6-NEXT: v_mov_b32_e32 v11, s30
+; GFX6-NEXT: v_mov_b32_e32 v11, s33
; GFX6-NEXT: v_mov_b32_e32 v12, s61
-; GFX6-NEXT: v_mov_b32_e32 v13, s29
+; GFX6-NEXT: v_mov_b32_e32 v13, s31
; GFX6-NEXT: v_mov_b32_e32 v14, s60
+; GFX6-NEXT: v_mov_b32_e32 v15, s30
+; GFX6-NEXT: v_mov_b32_e32 v16, s59
+; GFX6-NEXT: v_mov_b32_e32 v17, s29
+; GFX6-NEXT: v_mov_b32_e32 v18, s58
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s57
+; GFX6-NEXT: v_mov_b32_e32 v19, s28
+; GFX6-NEXT: v_mov_b32_e32 v1, s27
+; GFX6-NEXT: v_mov_b32_e32 v2, s68
+; GFX6-NEXT: v_mov_b32_e32 v3, s26
; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
-; GFX6-NEXT: v_mov_b32_e32 v15, s28
; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
-; GFX6-NEXT: s_waitcnt expcnt(3)
-; GFX6-NEXT: v_mov_b32_e32 v0, s68
-; GFX6-NEXT: v_mov_b32_e32 v1, s27
-; GFX6-NEXT: v_mov_b32_e32 v2, s59
-; GFX6-NEXT: v_mov_b32_e32 v3, s26
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s58
-; GFX6-NEXT: v_mov_b32_e32 v1, s25
-; GFX6-NEXT: v_mov_b32_e32 v2, s57
-; GFX6-NEXT: v_mov_b32_e32 v3, s24
+; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s56
-; GFX6-NEXT: v_mov_b32_e32 v1, s23
+; GFX6-NEXT: v_mov_b32_e32 v1, s25
; GFX6-NEXT: v_mov_b32_e32 v2, s55
-; GFX6-NEXT: v_mov_b32_e32 v3, s22
+; GFX6-NEXT: v_mov_b32_e32 v3, s24
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s53
-; GFX6-NEXT: v_mov_b32_e32 v1, s21
+; GFX6-NEXT: v_mov_b32_e32 v1, s23
; GFX6-NEXT: v_mov_b32_e32 v2, s54
-; GFX6-NEXT: v_mov_b32_e32 v3, s20
+; GFX6-NEXT: v_mov_b32_e32 v3, s22
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s52
-; GFX6-NEXT: v_mov_b32_e32 v1, s19
+; GFX6-NEXT: v_mov_b32_e32 v1, s21
; GFX6-NEXT: v_mov_b32_e32 v2, s51
-; GFX6-NEXT: v_mov_b32_e32 v3, s18
+; GFX6-NEXT: v_mov_b32_e32 v3, s20
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s50
-; GFX6-NEXT: v_mov_b32_e32 v1, s17
+; GFX6-NEXT: v_mov_b32_e32 v1, s19
; GFX6-NEXT: v_mov_b32_e32 v2, s49
-; GFX6-NEXT: v_mov_b32_e32 v3, s16
+; GFX6-NEXT: v_mov_b32_e32 v3, s18
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s48
-; GFX6-NEXT: v_mov_b32_e32 v1, s15
+; GFX6-NEXT: v_mov_b32_e32 v1, s17
; GFX6-NEXT: v_mov_b32_e32 v2, s47
-; GFX6-NEXT: v_mov_b32_e32 v3, s14
+; GFX6-NEXT: v_mov_b32_e32 v3, s16
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s46
-; GFX6-NEXT: v_mov_b32_e32 v1, s13
+; GFX6-NEXT: v_mov_b32_e32 v1, s15
; GFX6-NEXT: v_mov_b32_e32 v2, s45
-; GFX6-NEXT: v_mov_b32_e32 v3, s12
+; GFX6-NEXT: v_mov_b32_e32 v3, s14
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s44
-; GFX6-NEXT: v_mov_b32_e32 v1, s11
+; GFX6-NEXT: v_mov_b32_e32 v1, s13
; GFX6-NEXT: v_mov_b32_e32 v2, s43
; GFX6-NEXT: v_mov_b32_e32 v3, s10
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
@@ -2853,9 +2852,9 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX6-NEXT: v_mov_b32_e32 v3, s6
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s37
+; GFX6-NEXT: v_mov_b32_e32 v0, s12
; GFX6-NEXT: v_mov_b32_e32 v1, s5
-; GFX6-NEXT: v_mov_b32_e32 v2, s38
+; GFX6-NEXT: v_mov_b32_e32 v2, s11
; GFX6-NEXT: v_mov_b32_e32 v3, s4
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
@@ -3446,59 +3445,58 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX6-NEXT: s_bfe_i32 s46, s3, 0x1000a
; GFX6-NEXT: s_bfe_i32 s47, s3, 0x10009
; GFX6-NEXT: s_bfe_i32 s48, s3, 0x10008
-; GFX6-NEXT: s_bfe_i32 s49, s3, 0x1000f
-; GFX6-NEXT: s_bfe_i32 s50, s3, 0x1000e
-; GFX6-NEXT: s_bfe_i32 s51, s3, 0x1000d
-; GFX6-NEXT: s_bfe_i32 s52, s3, 0x1000c
+; GFX6-NEXT: s_bfe_i32 s49, s3, 0x1000e
+; GFX6-NEXT: s_bfe_i32 s50, s3, 0x1000d
+; GFX6-NEXT: s_bfe_i32 s51, s3, 0x1000c
+; GFX6-NEXT: s_bfe_i32 s52, s3, 0x10013
; GFX6-NEXT: s_bfe_i32 s53, s3, 0x10012
; GFX6-NEXT: s_bfe_i32 s54, s3, 0x10011
; GFX6-NEXT: s_bfe_i32 s55, s3, 0x10010
; GFX6-NEXT: s_bfe_i32 s56, s3, 0x10017
; GFX6-NEXT: s_bfe_i32 s57, s3, 0x10016
; GFX6-NEXT: s_bfe_i32 s58, s3, 0x10015
-; GFX6-NEXT: s_bfe_i32 s59, s3, 0x1001b
-; GFX6-NEXT: s_bfe_i32 s60, s3, 0x1001a
-; GFX6-NEXT: s_bfe_i32 s61, s3, 0x10019
-; GFX6-NEXT: s_bfe_i32 s62, s3, 0x10018
-; GFX6-NEXT: s_ashr_i32 s63, s3, 31
-; GFX6-NEXT: s_bfe_i32 s64, s3, 0x1001e
-; GFX6-NEXT: s_bfe_i32 s65, s3, 0x1001d
-; GFX6-NEXT: s_bfe_i32 s66, s3, 0x1001c
-; GFX6-NEXT: s_bfe_i32 s67, s3, 0x10014
-; GFX6-NEXT: s_bfe_i32 s68, s3, 0x10013
+; GFX6-NEXT: s_bfe_i32 s59, s3, 0x10014
+; GFX6-NEXT: s_bfe_i32 s60, s3, 0x1001b
+; GFX6-NEXT: s_bfe_i32 s61, s3, 0x1001a
+; GFX6-NEXT: s_bfe_i32 s62, s3, 0x10019
+; GFX6-NEXT: s_bfe_i32 s63, s3, 0x10018
+; GFX6-NEXT: s_ashr_i32 s64, s3, 31
+; GFX6-NEXT: s_bfe_i32 s65, s3, 0x1001e
+; GFX6-NEXT: s_bfe_i32 s66, s3, 0x1001d
+; GFX6-NEXT: s_bfe_i32 s67, s3, 0x1001c
+; GFX6-NEXT: s_bfe_i32 s68, s3, 0x1000f
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_mov_b32_e32 v0, s66
-; GFX6-NEXT: v_mov_b32_e32 v1, s65
-; GFX6-NEXT: v_mov_b32_e32 v2, s64
-; GFX6-NEXT: v_mov_b32_e32 v3, s63
-; GFX6-NEXT: v_mov_b32_e32 v4, s62
-; GFX6-NEXT: v_mov_b32_e32 v5, s61
-; GFX6-NEXT: v_mov_b32_e32 v6, s60
-; GFX6-NEXT: v_mov_b32_e32 v7, s59
-; GFX6-NEXT: v_mov_b32_e32 v8, s67
+; GFX6-NEXT: v_mov_b32_e32 v0, s67
+; GFX6-NEXT: v_mov_b32_e32 v1, s66
+; GFX6-NEXT: v_mov_b32_e32 v2, s65
+; GFX6-NEXT: v_mov_b32_e32 v3, s64
+; GFX6-NEXT: v_mov_b32_e32 v4, s63
+; GFX6-NEXT: v_mov_b32_e32 v5, s62
+; GFX6-NEXT: v_mov_b32_e32 v6, s61
+; GFX6-NEXT: v_mov_b32_e32 v7, s60
+; GFX6-NEXT: v_mov_b32_e32 v8, s59
; GFX6-NEXT: v_mov_b32_e32 v9, s58
; GFX6-NEXT: v_mov_b32_e32 v10, s57
; GFX6-NEXT: v_mov_b32_e32 v11, s56
; GFX6-NEXT: v_mov_b32_e32 v12, s55
; GFX6-NEXT: v_mov_b32_e32 v13, s54
; GFX6-NEXT: v_mov_b32_e32 v14, s53
+; GFX6-NEXT: v_mov_b32_e32 v15, s52
+; GFX6-NEXT: v_mov_b32_e32 v16, s51
+; GFX6-NEXT: v_mov_b32_e32 v17, s50
+; GFX6-NEXT: v_mov_b32_e32 v18, s49
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
-; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
-; GFX6-NEXT: v_mov_b32_e32 v15, s68
-; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
-; GFX6-NEXT: s_waitcnt expcnt(3)
-; GFX6-NEXT: v_mov_b32_e32 v0, s52
-; GFX6-NEXT: v_mov_b32_e32 v1, s51
-; GFX6-NEXT: v_mov_b32_e32 v2, s50
-; GFX6-NEXT: v_mov_b32_e32 v3, s49
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s48
+; GFX6-NEXT: v_mov_b32_e32 v19, s68
; GFX6-NEXT: v_mov_b32_e32 v1, s47
; GFX6-NEXT: v_mov_b32_e32 v2, s46
; GFX6-NEXT: v_mov_b32_e32 v3, s45
+; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
+; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s44
@@ -5099,40 +5097,40 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_add_u32 s2, s0, 48
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v19, s3
-; GFX8-NEXT: v_mov_b32_e32 v18, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_mov_b32_e32 v17, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v16, s0
-; GFX8-NEXT: s_add_u32 s0, s0, 16
+; GFX8-NEXT: s_add_u32 s4, s0, 32
+; GFX8-NEXT: s_addc_u32 s5, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v16, s5
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v1
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
-; GFX8-NEXT: v_mov_b32_e32 v23, s1
+; GFX8-NEXT: v_mov_b32_e32 v15, s4
+; GFX8-NEXT: v_mov_b32_e32 v8, v1
+; GFX8-NEXT: v_mov_b32_e32 v10, v1
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v11, v1
-; GFX8-NEXT: v_mov_b32_e32 v13, v1
-; GFX8-NEXT: v_mov_b32_e32 v15, v1
-; GFX8-NEXT: v_mov_b32_e32 v22, s0
+; GFX8-NEXT: v_mov_b32_e32 v12, v1
+; GFX8-NEXT: v_mov_b32_e32 v14, v1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v0
; GFX8-NEXT: v_bfe_u32 v6, v0, 5, 1
; GFX8-NEXT: v_bfe_u32 v4, v0, 4, 1
-; GFX8-NEXT: v_bfe_u32 v10, v0, 3, 1
-; GFX8-NEXT: v_bfe_u32 v14, v0, 1, 1
-; GFX8-NEXT: v_and_b32_e32 v12, 1, v0
-; GFX8-NEXT: v_bfe_u32 v8, v0, 2, 1
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 7, v24
-; GFX8-NEXT: v_bfe_u32 v0, v24, 6, 1
-; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[4:7]
-; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[8:11]
-; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
-; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[4:7]
+; GFX8-NEXT: v_mov_b32_e32 v16, s3
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: s_add_u32 s0, s0, 16
+; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v18, s1
+; GFX8-NEXT: v_mov_b32_e32 v17, s0
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v0
+; GFX8-NEXT: v_bfe_u32 v9, v0, 3, 1
+; GFX8-NEXT: v_bfe_u32 v7, v0, 2, 1
+; GFX8-NEXT: v_mov_b32_e32 v15, s2
+; GFX8-NEXT: v_bfe_u32 v13, v0, 1, 1
+; GFX8-NEXT: v_and_b32_e32 v11, 1, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 7, v6
+; GFX8-NEXT: v_bfe_u32 v0, v6, 6, 1
+; GFX8-NEXT: flat_store_dwordx4 v[17:18], v[7:10]
+; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[0:3]
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[11:14]
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v8i1_to_v8i64:
@@ -5728,61 +5726,63 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX6-NEXT: s_mov_b32 s1, s5
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 14, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 15, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 15, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v7, 12, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v9, 13, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v8, 13, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v11, 10, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v12, 8, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v13, 9, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v8, 6, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v10, 7, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 4, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v6, 5, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v12, 11, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v14, 8, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v16, 9, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v15, 6, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v9, 4, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v10, 5, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 2, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 3, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v16, 1, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v13, 1, v1
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 1
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 1
-; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 1
-; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 1
-; GFX6-NEXT: v_bfe_i32 v10, v10, 0, 1
-; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 1
-; GFX6-NEXT: v_bfe_i32 v14, v13, 0, 1
-; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 1
-; GFX6-NEXT: v_bfe_i32 v17, v5, 0, 1
-; GFX6-NEXT: v_bfe_i32 v15, v3, 0, 1
+; GFX6-NEXT: v_bfe_i32 v5, v4, 0, 1
+; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 1
+; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX6-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:112
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_bfe_i32 v6, v10, 0, 1
+; GFX6-NEXT: v_bfe_i32 v4, v9, 0, 1
+; GFX6-NEXT: v_bfe_i32 v9, v8, 0, 1
+; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 1
+; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v9
+; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:96
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_bfe_i32 v9, v12, 0, 1
+; GFX6-NEXT: v_bfe_i32 v7, v11, 0, 1
+; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 1
+; GFX6-NEXT: v_bfe_i32 v11, v1, 0, 1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 7, v1
+; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v9
+; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:80
+; GFX6-NEXT: v_bfe_i32 v17, v1, 0, 1
+; GFX6-NEXT: v_bfe_i32 v15, v15, 0, 1
; GFX6-NEXT: v_bfe_i32 v21, v16, 0, 1
-; GFX6-NEXT: v_bfe_i32 v19, v1, 0, 1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v1
-; GFX6-NEXT: v_ashrrev_i32_e32 v18, 31, v17
-; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GFX6-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:112
-; GFX6-NEXT: v_bfe_i32 v25, v1, 0, 1
-; GFX6-NEXT: v_bfe_i32 v23, v11, 0, 1
-; GFX6-NEXT: v_bfe_i32 v29, v9, 0, 1
-; GFX6-NEXT: v_bfe_i32 v27, v7, 0, 1
-; GFX6-NEXT: v_ashrrev_i32_e32 v20, 31, v19
-; GFX6-NEXT: v_ashrrev_i32_e32 v22, 31, v21
+; GFX6-NEXT: v_bfe_i32 v19, v14, 0, 1
+; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v4
-; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v10
-; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v14
-; GFX6-NEXT: v_ashrrev_i32_e32 v13, 31, v12
-; GFX6-NEXT: v_ashrrev_i32_e32 v26, 31, v25
-; GFX6-NEXT: v_ashrrev_i32_e32 v24, 31, v23
-; GFX6-NEXT: v_ashrrev_i32_e32 v30, 31, v29
-; GFX6-NEXT: v_ashrrev_i32_e32 v28, 31, v27
-; GFX6-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:96
-; GFX6-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:80
-; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:64
-; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
+; GFX6-NEXT: v_ashrrev_i32_e32 v18, 31, v17
+; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; GFX6-NEXT: v_ashrrev_i32_e32 v22, 31, v21
+; GFX6-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; GFX6-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:64
+; GFX6-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48
; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GFX6-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0
+; GFX6-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: constant_sextload_v16i1_to_v16i64:
@@ -5792,8 +5792,8 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v27, s1
-; GFX8-NEXT: v_mov_b32_e32 v26, s0
+; GFX8-NEXT: v_mov_b32_e32 v19, s1
+; GFX8-NEXT: v_mov_b32_e32 v18, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s3, v0
; GFX8-NEXT: s_lshr_b32 s2, s3, 14
@@ -5831,70 +5831,70 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX8-NEXT: s_add_u32 s2, s0, 0x70
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v23, s3
+; GFX8-NEXT: v_mov_b32_e32 v15, s3
+; GFX8-NEXT: v_mov_b32_e32 v14, s2
+; GFX8-NEXT: s_add_u32 s2, s0, 0x60
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: v_mov_b32_e32 v22, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0x60
-; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[2:5]
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[2:5]
+; GFX8-NEXT: v_mov_b32_e32 v15, s3
+; GFX8-NEXT: v_mov_b32_e32 v14, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x50
; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: v_mov_b32_e32 v7, s7
; GFX8-NEXT: v_mov_b32_e32 v8, s8
; GFX8-NEXT: v_mov_b32_e32 v9, s9
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[6:9]
-; GFX8-NEXT: v_mov_b32_e32 v10, s10
-; GFX8-NEXT: v_mov_b32_e32 v9, s3
-; GFX8-NEXT: v_mov_b32_e32 v8, s2
+; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[6:9]
+; GFX8-NEXT: v_mov_b32_e32 v15, s3
+; GFX8-NEXT: v_mov_b32_e32 v14, s2
; GFX8-NEXT: s_add_u32 s2, s0, 64
+; GFX8-NEXT: v_mov_b32_e32 v10, s10
; GFX8-NEXT: v_mov_b32_e32 v11, s11
; GFX8-NEXT: v_mov_b32_e32 v12, s12
; GFX8-NEXT: v_mov_b32_e32 v13, s13
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[10:13]
-; GFX8-NEXT: v_mov_b32_e32 v9, s3
-; GFX8-NEXT: v_mov_b32_e32 v8, s2
+; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[10:13]
+; GFX8-NEXT: v_mov_b32_e32 v15, s3
+; GFX8-NEXT: v_mov_b32_e32 v14, s2
; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: v_mov_b32_e32 v14, s14
-; GFX8-NEXT: v_mov_b32_e32 v15, s15
-; GFX8-NEXT: v_mov_b32_e32 v16, s16
-; GFX8-NEXT: v_mov_b32_e32 v17, s17
+; GFX8-NEXT: v_mov_b32_e32 v2, s14
+; GFX8-NEXT: v_mov_b32_e32 v3, s15
+; GFX8-NEXT: v_mov_b32_e32 v4, s16
+; GFX8-NEXT: v_mov_b32_e32 v5, s17
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[14:17]
-; GFX8-NEXT: v_mov_b32_e32 v9, s3
-; GFX8-NEXT: v_mov_b32_e32 v8, s2
+; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[2:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, s18
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_mov_b32_e32 v18, s18
-; GFX8-NEXT: v_mov_b32_e32 v19, s19
-; GFX8-NEXT: v_mov_b32_e32 v20, s20
-; GFX8-NEXT: v_mov_b32_e32 v21, s21
+; GFX8-NEXT: v_mov_b32_e32 v7, s19
+; GFX8-NEXT: v_mov_b32_e32 v8, s20
+; GFX8-NEXT: v_mov_b32_e32 v9, s21
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[18:21]
-; GFX8-NEXT: v_mov_b32_e32 v9, s3
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[6:9]
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: s_add_u32 s0, s0, 16
-; GFX8-NEXT: v_mov_b32_e32 v22, s22
-; GFX8-NEXT: v_mov_b32_e32 v23, s23
-; GFX8-NEXT: v_mov_b32_e32 v24, s24
-; GFX8-NEXT: v_mov_b32_e32 v25, s25
-; GFX8-NEXT: v_mov_b32_e32 v8, s2
+; GFX8-NEXT: v_mov_b32_e32 v10, s22
+; GFX8-NEXT: v_mov_b32_e32 v11, s23
+; GFX8-NEXT: v_mov_b32_e32 v12, s24
+; GFX8-NEXT: v_mov_b32_e32 v13, s25
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[22:25]
-; GFX8-NEXT: v_mov_b32_e32 v9, s1
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[10:13]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v4, s26
-; GFX8-NEXT: v_mov_b32_e32 v5, s27
-; GFX8-NEXT: v_mov_b32_e32 v6, s28
-; GFX8-NEXT: v_mov_b32_e32 v7, s29
-; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: v_mov_b32_e32 v14, s26
+; GFX8-NEXT: v_mov_b32_e32 v15, s27
+; GFX8-NEXT: v_mov_b32_e32 v16, s28
+; GFX8-NEXT: v_mov_b32_e32 v17, s29
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX8-NEXT: v_mov_b32_e32 v2, s30
; GFX8-NEXT: v_mov_b32_e32 v3, s31
-; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[0:3]
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[14:17]
+; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: constant_sextload_v16i1_to_v16i64:
@@ -6607,164 +6607,164 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_load_dword s8, s[2:3], 0x0
+; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_lshr_b32 s52, s8, 30
-; GFX6-NEXT: s_lshr_b32 s46, s8, 31
-; GFX6-NEXT: s_lshr_b32 s48, s8, 28
-; GFX6-NEXT: s_lshr_b32 s36, s8, 29
-; GFX6-NEXT: s_lshr_b32 s38, s8, 26
-; GFX6-NEXT: s_lshr_b32 s26, s8, 27
-; GFX6-NEXT: s_lshr_b32 s28, s8, 24
-; GFX6-NEXT: s_lshr_b32 s4, s8, 25
-; GFX6-NEXT: s_lshr_b32 s6, s8, 22
-; GFX6-NEXT: s_lshr_b32 s10, s8, 23
-; GFX6-NEXT: s_lshr_b32 s12, s8, 20
-; GFX6-NEXT: s_lshr_b32 s14, s8, 21
-; GFX6-NEXT: s_lshr_b32 s16, s8, 18
-; GFX6-NEXT: s_lshr_b32 s18, s8, 19
-; GFX6-NEXT: s_lshr_b32 s20, s8, 16
-; GFX6-NEXT: s_lshr_b32 s22, s8, 17
-; GFX6-NEXT: s_lshr_b32 s24, s8, 14
-; GFX6-NEXT: s_lshr_b32 s30, s8, 15
-; GFX6-NEXT: s_lshr_b32 s34, s8, 12
-; GFX6-NEXT: s_lshr_b32 s40, s8, 13
-; GFX6-NEXT: s_lshr_b32 s42, s8, 10
-; GFX6-NEXT: s_lshr_b32 s44, s8, 11
-; GFX6-NEXT: s_bfe_i64 s[50:51], s[8:9], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000
-; GFX6-NEXT: v_mov_b32_e32 v0, s50
-; GFX6-NEXT: v_mov_b32_e32 v1, s51
-; GFX6-NEXT: s_lshr_b32 s50, s8, 8
-; GFX6-NEXT: v_mov_b32_e32 v2, s52
-; GFX6-NEXT: v_mov_b32_e32 v3, s53
-; GFX6-NEXT: s_lshr_b32 s52, s8, 9
-; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000
-; GFX6-NEXT: v_mov_b32_e32 v4, s46
-; GFX6-NEXT: v_mov_b32_e32 v5, s47
-; GFX6-NEXT: s_lshr_b32 s46, s8, 6
-; GFX6-NEXT: v_mov_b32_e32 v6, s48
-; GFX6-NEXT: v_mov_b32_e32 v7, s49
-; GFX6-NEXT: s_lshr_b32 s48, s8, 7
-; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX6-NEXT: s_lshr_b32 s38, s4, 30
+; GFX6-NEXT: s_lshr_b32 s40, s4, 31
+; GFX6-NEXT: s_lshr_b32 s34, s4, 28
+; GFX6-NEXT: s_lshr_b32 s36, s4, 29
+; GFX6-NEXT: s_lshr_b32 s28, s4, 26
+; GFX6-NEXT: s_lshr_b32 s30, s4, 27
+; GFX6-NEXT: s_lshr_b32 s24, s4, 24
+; GFX6-NEXT: s_lshr_b32 s26, s4, 25
+; GFX6-NEXT: s_lshr_b32 s20, s4, 22
+; GFX6-NEXT: s_lshr_b32 s22, s4, 23
+; GFX6-NEXT: s_lshr_b32 s18, s4, 20
+; GFX6-NEXT: s_lshr_b32 s6, s4, 21
+; GFX6-NEXT: s_lshr_b32 s8, s4, 18
+; GFX6-NEXT: s_lshr_b32 s10, s4, 19
+; GFX6-NEXT: s_lshr_b32 s12, s4, 16
+; GFX6-NEXT: s_lshr_b32 s14, s4, 17
+; GFX6-NEXT: s_lshr_b32 s16, s4, 14
+; GFX6-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x10000
+; GFX6-NEXT: s_lshr_b32 s42, s4, 15
+; GFX6-NEXT: v_mov_b32_e32 v0, s44
+; GFX6-NEXT: v_mov_b32_e32 v1, s45
+; GFX6-NEXT: s_lshr_b32 s44, s4, 12
; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
+; GFX6-NEXT: v_mov_b32_e32 v2, s38
+; GFX6-NEXT: v_mov_b32_e32 v3, s39
+; GFX6-NEXT: s_lshr_b32 s38, s4, 13
+; GFX6-NEXT: v_mov_b32_e32 v4, s40
+; GFX6-NEXT: v_mov_b32_e32 v5, s41
+; GFX6-NEXT: s_lshr_b32 s40, s4, 10
+; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX6-NEXT: v_mov_b32_e32 v6, s34
+; GFX6-NEXT: v_mov_b32_e32 v7, s35
+; GFX6-NEXT: s_lshr_b32 s34, s4, 11
; GFX6-NEXT: v_mov_b32_e32 v8, s36
; GFX6-NEXT: v_mov_b32_e32 v9, s37
-; GFX6-NEXT: s_lshr_b32 s36, s8, 4
-; GFX6-NEXT: v_mov_b32_e32 v10, s38
-; GFX6-NEXT: v_mov_b32_e32 v11, s39
-; GFX6-NEXT: s_lshr_b32 s38, s8, 5
-; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
-; GFX6-NEXT: v_mov_b32_e32 v12, s26
-; GFX6-NEXT: v_mov_b32_e32 v13, s27
-; GFX6-NEXT: s_lshr_b32 s26, s8, 2
-; GFX6-NEXT: v_mov_b32_e32 v14, s28
-; GFX6-NEXT: v_mov_b32_e32 v15, s29
-; GFX6-NEXT: s_lshr_b32 s28, s8, 3
-; GFX6-NEXT: s_lshr_b32 s8, s8, 1
-; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX6-NEXT: s_lshr_b32 s36, s4, 8
; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX6-NEXT: v_mov_b32_e32 v10, s28
+; GFX6-NEXT: v_mov_b32_e32 v11, s29
+; GFX6-NEXT: s_lshr_b32 s28, s4, 9
+; GFX6-NEXT: v_mov_b32_e32 v12, s30
+; GFX6-NEXT: v_mov_b32_e32 v13, s31
+; GFX6-NEXT: s_lshr_b32 s30, s4, 6
; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX6-NEXT: v_mov_b32_e32 v14, s24
+; GFX6-NEXT: v_mov_b32_e32 v15, s25
+; GFX6-NEXT: s_lshr_b32 s24, s4, 7
+; GFX6-NEXT: v_mov_b32_e32 v16, s26
+; GFX6-NEXT: v_mov_b32_e32 v17, s27
+; GFX6-NEXT: s_lshr_b32 s26, s4, 4
; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:240
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, s20
+; GFX6-NEXT: v_mov_b32_e32 v3, s21
+; GFX6-NEXT: s_lshr_b32 s20, s4, 5
+; GFX6-NEXT: v_mov_b32_e32 v4, s22
+; GFX6-NEXT: v_mov_b32_e32 v5, s23
+; GFX6-NEXT: s_lshr_b32 s22, s4, 2
+; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:224
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v6, s18
+; GFX6-NEXT: v_mov_b32_e32 v7, s19
+; GFX6-NEXT: s_lshr_b32 s18, s4, 3
+; GFX6-NEXT: s_lshr_b32 s4, s4, 1
+; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:240
-; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:224
; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:208
-; GFX6-NEXT: v_mov_b32_e32 v16, s4
-; GFX6-NEXT: v_mov_b32_e32 v17, s5
; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:192
-; GFX6-NEXT: s_waitcnt expcnt(3)
-; GFX6-NEXT: v_mov_b32_e32 v2, s6
-; GFX6-NEXT: v_mov_b32_e32 v3, s7
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176
+; GFX6-NEXT: v_mov_b32_e32 v8, s6
+; GFX6-NEXT: v_mov_b32_e32 v9, s7
+; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:160
+; GFX6-NEXT: s_waitcnt expcnt(1)
+; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: v_mov_b32_e32 v3, s9
; GFX6-NEXT: v_mov_b32_e32 v4, s10
; GFX6-NEXT: v_mov_b32_e32 v5, s11
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:144
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s12
; GFX6-NEXT: v_mov_b32_e32 v3, s13
; GFX6-NEXT: v_mov_b32_e32 v4, s14
; GFX6-NEXT: v_mov_b32_e32 v5, s15
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:160
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:128
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s16
; GFX6-NEXT: v_mov_b32_e32 v3, s17
-; GFX6-NEXT: v_mov_b32_e32 v4, s18
-; GFX6-NEXT: v_mov_b32_e32 v5, s19
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:144
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: v_mov_b32_e32 v3, s21
-; GFX6-NEXT: v_mov_b32_e32 v4, s22
-; GFX6-NEXT: v_mov_b32_e32 v5, s23
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:128
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s24
-; GFX6-NEXT: v_mov_b32_e32 v3, s25
-; GFX6-NEXT: v_mov_b32_e32 v4, s30
-; GFX6-NEXT: v_mov_b32_e32 v5, s31
+; GFX6-NEXT: v_mov_b32_e32 v4, s42
+; GFX6-NEXT: v_mov_b32_e32 v5, s43
; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s34
-; GFX6-NEXT: v_mov_b32_e32 v3, s35
-; GFX6-NEXT: v_mov_b32_e32 v4, s40
-; GFX6-NEXT: v_mov_b32_e32 v5, s41
+; GFX6-NEXT: v_mov_b32_e32 v2, s44
+; GFX6-NEXT: v_mov_b32_e32 v3, s45
+; GFX6-NEXT: v_mov_b32_e32 v4, s38
+; GFX6-NEXT: v_mov_b32_e32 v5, s39
; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:96
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s42
-; GFX6-NEXT: v_mov_b32_e32 v3, s43
-; GFX6-NEXT: v_mov_b32_e32 v4, s44
-; GFX6-NEXT: v_mov_b32_e32 v5, s45
+; GFX6-NEXT: v_mov_b32_e32 v2, s40
+; GFX6-NEXT: v_mov_b32_e32 v3, s41
+; GFX6-NEXT: v_mov_b32_e32 v4, s34
+; GFX6-NEXT: v_mov_b32_e32 v5, s35
; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s50
-; GFX6-NEXT: v_mov_b32_e32 v3, s51
-; GFX6-NEXT: v_mov_b32_e32 v4, s52
-; GFX6-NEXT: v_mov_b32_e32 v5, s53
+; GFX6-NEXT: v_mov_b32_e32 v2, s36
+; GFX6-NEXT: v_mov_b32_e32 v3, s37
+; GFX6-NEXT: v_mov_b32_e32 v4, s28
+; GFX6-NEXT: v_mov_b32_e32 v5, s29
; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:64
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s46
-; GFX6-NEXT: v_mov_b32_e32 v3, s47
-; GFX6-NEXT: v_mov_b32_e32 v4, s48
-; GFX6-NEXT: v_mov_b32_e32 v5, s49
+; GFX6-NEXT: v_mov_b32_e32 v2, s30
+; GFX6-NEXT: v_mov_b32_e32 v3, s31
+; GFX6-NEXT: v_mov_b32_e32 v4, s24
+; GFX6-NEXT: v_mov_b32_e32 v5, s25
; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:48
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s36
-; GFX6-NEXT: v_mov_b32_e32 v3, s37
-; GFX6-NEXT: v_mov_b32_e32 v4, s38
-; GFX6-NEXT: v_mov_b32_e32 v5, s39
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:32
-; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s26
; GFX6-NEXT: v_mov_b32_e32 v3, s27
-; GFX6-NEXT: v_mov_b32_e32 v4, s28
-; GFX6-NEXT: v_mov_b32_e32 v5, s29
+; GFX6-NEXT: v_mov_b32_e32 v4, s20
+; GFX6-NEXT: v_mov_b32_e32 v5, s21
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:32
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, s22
+; GFX6-NEXT: v_mov_b32_e32 v3, s23
+; GFX6-NEXT: v_mov_b32_e32 v4, s18
+; GFX6-NEXT: v_mov_b32_e32 v5, s19
; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NEXT: v_mov_b32_e32 v3, s9
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: v_mov_b32_e32 v3, s5
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
@@ -7332,21 +7332,21 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX6-NEXT: s_bfe_u32 s29, s2, 0x1001b
; GFX6-NEXT: s_bfe_u32 s31, s2, 0x1001d
; GFX6-NEXT: s_lshr_b32 s34, s2, 31
-; GFX6-NEXT: s_bfe_u32 s36, s3, 0x10003
-; GFX6-NEXT: s_bfe_u32 s37, s3, 0x10005
-; GFX6-NEXT: s_bfe_u32 s38, s3, 0x10007
-; GFX6-NEXT: s_bfe_u32 s39, s3, 0x10009
-; GFX6-NEXT: s_bfe_u32 s40, s3, 0x1000b
-; GFX6-NEXT: s_bfe_u32 s41, s3, 0x1000d
-; GFX6-NEXT: s_bfe_u32 s42, s3, 0x1000f
-; GFX6-NEXT: s_bfe_u32 s43, s3, 0x10011
-; GFX6-NEXT: s_bfe_u32 s44, s3, 0x10013
-; GFX6-NEXT: s_bfe_u32 s45, s3, 0x10015
-; GFX6-NEXT: s_bfe_u32 s46, s3, 0x10017
-; GFX6-NEXT: s_bfe_u32 s47, s3, 0x10019
-; GFX6-NEXT: s_bfe_u32 s48, s3, 0x1001b
-; GFX6-NEXT: s_bfe_u32 s49, s3, 0x1001d
-; GFX6-NEXT: s_lshr_b32 s50, s3, 31
+; GFX6-NEXT: s_bfe_u32 s35, s3, 0x10003
+; GFX6-NEXT: s_bfe_u32 s36, s3, 0x10005
+; GFX6-NEXT: s_bfe_u32 s37, s3, 0x10007
+; GFX6-NEXT: s_bfe_u32 s38, s3, 0x10009
+; GFX6-NEXT: s_bfe_u32 s39, s3, 0x1000b
+; GFX6-NEXT: s_bfe_u32 s40, s3, 0x1000d
+; GFX6-NEXT: s_bfe_u32 s41, s3, 0x1000f
+; GFX6-NEXT: s_bfe_u32 s42, s3, 0x10011
+; GFX6-NEXT: s_bfe_u32 s43, s3, 0x10013
+; GFX6-NEXT: s_bfe_u32 s44, s3, 0x10015
+; GFX6-NEXT: s_bfe_u32 s45, s3, 0x10017
+; GFX6-NEXT: s_bfe_u32 s46, s3, 0x10019
+; GFX6-NEXT: s_bfe_u32 s47, s3, 0x1001b
+; GFX6-NEXT: s_bfe_u32 s48, s3, 0x1001d
+; GFX6-NEXT: s_lshr_b32 s49, s3, 31
; GFX6-NEXT: s_bfe_u32 s9, s3, 0x10001
; GFX6-NEXT: s_bfe_u32 s6, s2, 0x10001
; GFX6-NEXT: s_and_b32 s7, s2, 1
@@ -7362,7 +7362,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX6-NEXT: s_bfe_u32 s28, s2, 0x10012
; GFX6-NEXT: s_bfe_u32 s30, s2, 0x10014
; GFX6-NEXT: s_bfe_u32 s33, s2, 0x10016
-; GFX6-NEXT: s_bfe_u32 s35, s2, 0x10018
+; GFX6-NEXT: s_bfe_u32 s50, s2, 0x10018
; GFX6-NEXT: s_bfe_u32 s51, s2, 0x1001a
; GFX6-NEXT: s_bfe_u32 s52, s2, 0x1001c
; GFX6-NEXT: s_bfe_u32 s53, s2, 0x1001e
@@ -7386,63 +7386,63 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mov_b32_e32 v3, v1
; GFX6-NEXT: v_mov_b32_e32 v0, s67
-; GFX6-NEXT: v_mov_b32_e32 v2, s50
+; GFX6-NEXT: v_mov_b32_e32 v2, s49
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:496
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s68
-; GFX6-NEXT: v_mov_b32_e32 v2, s49
+; GFX6-NEXT: v_mov_b32_e32 v2, s48
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:480
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s66
-; GFX6-NEXT: v_mov_b32_e32 v2, s48
+; GFX6-NEXT: v_mov_b32_e32 v2, s47
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:464
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s65
-; GFX6-NEXT: v_mov_b32_e32 v2, s47
+; GFX6-NEXT: v_mov_b32_e32 v2, s46
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:448
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s64
-; GFX6-NEXT: v_mov_b32_e32 v2, s46
+; GFX6-NEXT: v_mov_b32_e32 v2, s45
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:432
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s63
-; GFX6-NEXT: v_mov_b32_e32 v2, s45
+; GFX6-NEXT: v_mov_b32_e32 v2, s44
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:416
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s62
-; GFX6-NEXT: v_mov_b32_e32 v2, s44
+; GFX6-NEXT: v_mov_b32_e32 v2, s43
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:400
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s61
-; GFX6-NEXT: v_mov_b32_e32 v2, s43
+; GFX6-NEXT: v_mov_b32_e32 v2, s42
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:384
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s60
-; GFX6-NEXT: v_mov_b32_e32 v2, s42
+; GFX6-NEXT: v_mov_b32_e32 v2, s41
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:368
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s59
-; GFX6-NEXT: v_mov_b32_e32 v2, s41
+; GFX6-NEXT: v_mov_b32_e32 v2, s40
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:352
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s58
-; GFX6-NEXT: v_mov_b32_e32 v2, s40
+; GFX6-NEXT: v_mov_b32_e32 v2, s39
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:336
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s57
-; GFX6-NEXT: v_mov_b32_e32 v2, s39
+; GFX6-NEXT: v_mov_b32_e32 v2, s38
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:320
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s56
-; GFX6-NEXT: v_mov_b32_e32 v2, s38
+; GFX6-NEXT: v_mov_b32_e32 v2, s37
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:304
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s55
-; GFX6-NEXT: v_mov_b32_e32 v2, s37
+; GFX6-NEXT: v_mov_b32_e32 v2, s36
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:288
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s54
-; GFX6-NEXT: v_mov_b32_e32 v2, s36
+; GFX6-NEXT: v_mov_b32_e32 v2, s35
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:272
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s53
@@ -7457,7 +7457,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX6-NEXT: v_mov_b32_e32 v2, s29
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s35
+; GFX6-NEXT: v_mov_b32_e32 v0, s50
; GFX6-NEXT: v_mov_b32_e32 v2, s27
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -8347,478 +8347,477 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_lshr_b32 s48, s5, 30
-; GFX6-NEXT: s_lshr_b32 s46, s5, 28
-; GFX6-NEXT: s_lshr_b32 s44, s5, 29
-; GFX6-NEXT: s_lshr_b32 s40, s5, 26
-; GFX6-NEXT: s_lshr_b32 s42, s5, 27
-; GFX6-NEXT: s_lshr_b32 s36, s5, 24
-; GFX6-NEXT: s_lshr_b32 s38, s5, 25
-; GFX6-NEXT: s_lshr_b32 s30, s5, 22
-; GFX6-NEXT: s_lshr_b32 s34, s5, 23
-; GFX6-NEXT: s_lshr_b32 s26, s5, 20
-; GFX6-NEXT: s_lshr_b32 s28, s5, 21
-; GFX6-NEXT: s_lshr_b32 s22, s5, 18
-; GFX6-NEXT: s_lshr_b32 s24, s5, 19
-; GFX6-NEXT: s_lshr_b32 s18, s5, 16
-; GFX6-NEXT: s_lshr_b32 s20, s5, 17
-; GFX6-NEXT: s_lshr_b32 s14, s5, 14
-; GFX6-NEXT: s_lshr_b32 s16, s5, 15
-; GFX6-NEXT: s_lshr_b32 s10, s5, 12
-; GFX6-NEXT: s_lshr_b32 s12, s5, 13
-; GFX6-NEXT: s_lshr_b32 s6, s5, 10
-; GFX6-NEXT: s_lshr_b32 s8, s5, 11
-; GFX6-NEXT: s_mov_b32 s50, s5
-; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[52:53], s[4:5], 0x10000
-; GFX6-NEXT: v_mov_b32_e32 v0, s50
-; GFX6-NEXT: v_mov_b32_e32 v1, s51
-; GFX6-NEXT: s_lshr_b32 s50, s5, 8
-; GFX6-NEXT: v_mov_b32_e32 v4, s52
-; GFX6-NEXT: v_mov_b32_e32 v5, s53
-; GFX6-NEXT: s_lshr_b32 s52, s5, 9
-; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[54:55], s[46:47], 0x10000
-; GFX6-NEXT: v_mov_b32_e32 v6, s48
-; GFX6-NEXT: v_mov_b32_e32 v7, s49
-; GFX6-NEXT: s_lshr_b32 s46, s5, 6
-; GFX6-NEXT: v_mov_b32_e32 v10, s54
-; GFX6-NEXT: v_mov_b32_e32 v11, s55
-; GFX6-NEXT: s_lshr_b32 s48, s5, 7
-; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
+; GFX6-NEXT: s_lshr_b32 s42, s5, 30
+; GFX6-NEXT: s_lshr_b32 s36, s5, 28
+; GFX6-NEXT: s_lshr_b32 s38, s5, 29
+; GFX6-NEXT: s_lshr_b32 s30, s5, 26
+; GFX6-NEXT: s_lshr_b32 s34, s5, 27
+; GFX6-NEXT: s_lshr_b32 s26, s5, 24
+; GFX6-NEXT: s_lshr_b32 s28, s5, 25
+; GFX6-NEXT: s_lshr_b32 s22, s5, 22
+; GFX6-NEXT: s_lshr_b32 s24, s5, 23
+; GFX6-NEXT: s_lshr_b32 s18, s5, 20
+; GFX6-NEXT: s_lshr_b32 s20, s5, 21
+; GFX6-NEXT: s_lshr_b32 s14, s5, 18
+; GFX6-NEXT: s_lshr_b32 s16, s5, 19
+; GFX6-NEXT: s_lshr_b32 s10, s5, 16
+; GFX6-NEXT: s_lshr_b32 s12, s5, 17
+; GFX6-NEXT: s_lshr_b32 s6, s5, 14
+; GFX6-NEXT: s_lshr_b32 s8, s5, 15
+; GFX6-NEXT: s_mov_b32 s40, s5
; GFX6-NEXT: s_ashr_i32 s7, s5, 31
-; GFX6-NEXT: v_mov_b32_e32 v12, s44
-; GFX6-NEXT: v_mov_b32_e32 v13, s45
-; GFX6-NEXT: s_lshr_b32 s44, s5, 4
-; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[54:55], s[42:43], 0x10000
-; GFX6-NEXT: v_mov_b32_e32 v14, s40
-; GFX6-NEXT: v_mov_b32_e32 v15, s41
-; GFX6-NEXT: s_lshr_b32 s42, s5, 5
-; GFX6-NEXT: v_mov_b32_e32 v16, s54
-; GFX6-NEXT: v_mov_b32_e32 v17, s55
-; GFX6-NEXT: s_lshr_b32 s40, s5, 2
-; GFX6-NEXT: v_mov_b32_e32 v8, s7
+; GFX6-NEXT: s_bfe_i64 s[44:45], s[40:41], 0x10000
+; GFX6-NEXT: v_mov_b32_e32 v4, s7
+; GFX6-NEXT: s_lshr_b32 s40, s5, 12
+; GFX6-NEXT: v_mov_b32_e32 v0, s44
+; GFX6-NEXT: v_mov_b32_e32 v1, s45
+; GFX6-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX6-NEXT: v_mov_b32_e32 v6, s44
+; GFX6-NEXT: v_mov_b32_e32 v7, s45
+; GFX6-NEXT: s_lshr_b32 s44, s5, 13
+; GFX6-NEXT: v_mov_b32_e32 v2, s42
+; GFX6-NEXT: v_mov_b32_e32 v3, s43
+; GFX6-NEXT: s_lshr_b32 s42, s5, 10
; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
-; GFX6-NEXT: v_mov_b32_e32 v9, s7
-; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:496
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, s36
-; GFX6-NEXT: v_mov_b32_e32 v7, s37
-; GFX6-NEXT: s_lshr_b32 s36, s5, 3
-; GFX6-NEXT: v_mov_b32_e32 v8, s38
-; GFX6-NEXT: v_mov_b32_e32 v9, s39
-; GFX6-NEXT: s_lshr_b32 s38, s5, 1
+; GFX6-NEXT: v_mov_b32_e32 v8, s36
+; GFX6-NEXT: v_mov_b32_e32 v9, s37
+; GFX6-NEXT: s_lshr_b32 s36, s5, 11
+; GFX6-NEXT: v_mov_b32_e32 v10, s38
+; GFX6-NEXT: v_mov_b32_e32 v11, s39
+; GFX6-NEXT: s_lshr_b32 s38, s5, 8
; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:480
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v10, s30
-; GFX6-NEXT: v_mov_b32_e32 v11, s31
-; GFX6-NEXT: s_lshr_b32 s30, s4, 30
-; GFX6-NEXT: v_mov_b32_e32 v12, s34
-; GFX6-NEXT: v_mov_b32_e32 v13, s35
-; GFX6-NEXT: s_lshr_b32 s34, s4, 31
+; GFX6-NEXT: v_mov_b32_e32 v12, s30
+; GFX6-NEXT: v_mov_b32_e32 v13, s31
+; GFX6-NEXT: s_lshr_b32 s30, s5, 9
+; GFX6-NEXT: v_mov_b32_e32 v14, s34
+; GFX6-NEXT: v_mov_b32_e32 v15, s35
+; GFX6-NEXT: s_lshr_b32 s34, s5, 6
; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:464
+; GFX6-NEXT: v_mov_b32_e32 v5, s7
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:496
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v14, s26
-; GFX6-NEXT: v_mov_b32_e32 v15, s27
-; GFX6-NEXT: s_lshr_b32 s26, s4, 28
-; GFX6-NEXT: v_mov_b32_e32 v16, s28
-; GFX6-NEXT: v_mov_b32_e32 v17, s29
-; GFX6-NEXT: s_lshr_b32 s28, s4, 29
+; GFX6-NEXT: v_mov_b32_e32 v2, s26
+; GFX6-NEXT: v_mov_b32_e32 v3, s27
+; GFX6-NEXT: s_lshr_b32 s26, s5, 7
+; GFX6-NEXT: v_mov_b32_e32 v4, s28
+; GFX6-NEXT: v_mov_b32_e32 v5, s29
+; GFX6-NEXT: s_lshr_b32 s28, s5, 4
; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:448
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:480
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, s22
-; GFX6-NEXT: v_mov_b32_e32 v7, s23
-; GFX6-NEXT: s_lshr_b32 s22, s4, 26
-; GFX6-NEXT: v_mov_b32_e32 v8, s24
-; GFX6-NEXT: v_mov_b32_e32 v9, s25
-; GFX6-NEXT: s_lshr_b32 s24, s4, 27
-; GFX6-NEXT: s_bfe_i64 s[54:55], s[20:21], 0x10000
+; GFX6-NEXT: v_mov_b32_e32 v8, s22
+; GFX6-NEXT: v_mov_b32_e32 v9, s23
+; GFX6-NEXT: s_lshr_b32 s22, s5, 5
+; GFX6-NEXT: v_mov_b32_e32 v10, s24
+; GFX6-NEXT: v_mov_b32_e32 v11, s25
+; GFX6-NEXT: s_lshr_b32 s24, s5, 2
+; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:432
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:464
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v10, s18
-; GFX6-NEXT: v_mov_b32_e32 v11, s19
-; GFX6-NEXT: s_lshr_b32 s20, s4, 24
-; GFX6-NEXT: v_mov_b32_e32 v12, s54
-; GFX6-NEXT: v_mov_b32_e32 v13, s55
-; GFX6-NEXT: s_lshr_b32 s18, s4, 25
+; GFX6-NEXT: v_mov_b32_e32 v12, s18
+; GFX6-NEXT: v_mov_b32_e32 v13, s19
+; GFX6-NEXT: s_lshr_b32 s18, s5, 3
+; GFX6-NEXT: v_mov_b32_e32 v14, s20
+; GFX6-NEXT: v_mov_b32_e32 v15, s21
+; GFX6-NEXT: s_lshr_b32 s20, s5, 1
; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:416
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:448
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v14, s14
-; GFX6-NEXT: v_mov_b32_e32 v15, s15
-; GFX6-NEXT: s_lshr_b32 s14, s4, 22
-; GFX6-NEXT: v_mov_b32_e32 v16, s16
-; GFX6-NEXT: v_mov_b32_e32 v17, s17
-; GFX6-NEXT: s_lshr_b32 s16, s4, 23
+; GFX6-NEXT: v_mov_b32_e32 v2, s14
+; GFX6-NEXT: v_mov_b32_e32 v3, s15
+; GFX6-NEXT: s_lshr_b32 s14, s4, 30
+; GFX6-NEXT: v_mov_b32_e32 v4, s16
+; GFX6-NEXT: v_mov_b32_e32 v5, s17
+; GFX6-NEXT: s_lshr_b32 s16, s4, 31
; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:400
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:432
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, s10
-; GFX6-NEXT: v_mov_b32_e32 v7, s11
-; GFX6-NEXT: s_lshr_b32 s10, s4, 20
-; GFX6-NEXT: v_mov_b32_e32 v8, s12
-; GFX6-NEXT: v_mov_b32_e32 v9, s13
-; GFX6-NEXT: s_lshr_b32 s12, s4, 21
+; GFX6-NEXT: v_mov_b32_e32 v8, s10
+; GFX6-NEXT: v_mov_b32_e32 v9, s11
+; GFX6-NEXT: s_lshr_b32 s10, s4, 28
+; GFX6-NEXT: v_mov_b32_e32 v10, s12
+; GFX6-NEXT: v_mov_b32_e32 v11, s13
+; GFX6-NEXT: s_lshr_b32 s12, s4, 29
; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:384
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:416
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v12, s6
+; GFX6-NEXT: v_mov_b32_e32 v13, s7
+; GFX6-NEXT: s_lshr_b32 s46, s4, 26
+; GFX6-NEXT: v_mov_b32_e32 v14, s8
+; GFX6-NEXT: v_mov_b32_e32 v15, s9
+; GFX6-NEXT: s_lshr_b32 s8, s4, 27
+; GFX6-NEXT: s_bfe_i64 s[6:7], s[44:45], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:400
; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, s40
+; GFX6-NEXT: v_mov_b32_e32 v3, s41
+; GFX6-NEXT: s_lshr_b32 s40, s4, 24
+; GFX6-NEXT: v_mov_b32_e32 v4, s6
+; GFX6-NEXT: v_mov_b32_e32 v5, s7
+; GFX6-NEXT: s_lshr_b32 s44, s4, 25
+; GFX6-NEXT: s_bfe_i64 s[6:7], s[36:37], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[36:37], s[42:43], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:384
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v8, s36
+; GFX6-NEXT: v_mov_b32_e32 v9, s37
+; GFX6-NEXT: s_lshr_b32 s36, s4, 22
; GFX6-NEXT: v_mov_b32_e32 v10, s6
; GFX6-NEXT: v_mov_b32_e32 v11, s7
-; GFX6-NEXT: s_lshr_b32 s6, s4, 18
-; GFX6-NEXT: v_mov_b32_e32 v12, s8
-; GFX6-NEXT: v_mov_b32_e32 v13, s9
-; GFX6-NEXT: s_lshr_b32 s8, s4, 19
-; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:368
+; GFX6-NEXT: s_lshr_b32 s42, s4, 23
+; GFX6-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[30:31], s[38:39], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:368
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v12, s30
+; GFX6-NEXT: v_mov_b32_e32 v13, s31
+; GFX6-NEXT: s_lshr_b32 s30, s4, 20
+; GFX6-NEXT: v_mov_b32_e32 v14, s6
+; GFX6-NEXT: v_mov_b32_e32 v15, s7
+; GFX6-NEXT: s_lshr_b32 s6, s4, 21
+; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:352
+; GFX6-NEXT: v_mov_b32_e32 v16, s34
+; GFX6-NEXT: v_mov_b32_e32 v17, s35
+; GFX6-NEXT: s_lshr_b32 s34, s4, 18
+; GFX6-NEXT: v_mov_b32_e32 v18, s26
+; GFX6-NEXT: v_mov_b32_e32 v19, s27
+; GFX6-NEXT: s_lshr_b32 s26, s4, 19
+; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:336
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v8, s28
+; GFX6-NEXT: v_mov_b32_e32 v9, s29
+; GFX6-NEXT: s_lshr_b32 s28, s4, 16
+; GFX6-NEXT: v_mov_b32_e32 v10, s22
+; GFX6-NEXT: v_mov_b32_e32 v11, s23
+; GFX6-NEXT: s_lshr_b32 s22, s4, 17
+; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:320
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v14, s50
-; GFX6-NEXT: v_mov_b32_e32 v15, s51
-; GFX6-NEXT: s_lshr_b32 s50, s4, 16
-; GFX6-NEXT: v_mov_b32_e32 v16, s52
-; GFX6-NEXT: v_mov_b32_e32 v17, s53
-; GFX6-NEXT: s_lshr_b32 s52, s4, 17
-; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:352
+; GFX6-NEXT: v_mov_b32_e32 v12, s24
+; GFX6-NEXT: v_mov_b32_e32 v13, s25
+; GFX6-NEXT: s_lshr_b32 s24, s4, 14
+; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX6-NEXT: v_mov_b32_e32 v14, s18
+; GFX6-NEXT: v_mov_b32_e32 v15, s19
+; GFX6-NEXT: s_lshr_b32 s18, s4, 15
+; GFX6-NEXT: v_mov_b32_e32 v2, s20
+; GFX6-NEXT: v_mov_b32_e32 v3, s21
+; GFX6-NEXT: s_lshr_b32 s20, s4, 12
+; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:304
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v16, s14
+; GFX6-NEXT: v_mov_b32_e32 v17, s15
+; GFX6-NEXT: s_lshr_b32 s14, s4, 13
+; GFX6-NEXT: v_mov_b32_e32 v18, s16
+; GFX6-NEXT: v_mov_b32_e32 v19, s17
+; GFX6-NEXT: s_lshr_b32 s16, s4, 10
+; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:288
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, s46
-; GFX6-NEXT: v_mov_b32_e32 v7, s47
-; GFX6-NEXT: s_lshr_b32 s46, s4, 14
-; GFX6-NEXT: v_mov_b32_e32 v8, s48
-; GFX6-NEXT: v_mov_b32_e32 v9, s49
-; GFX6-NEXT: s_lshr_b32 s48, s4, 15
-; GFX6-NEXT: s_bfe_i64 s[54:55], s[42:43], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[42:43], s[44:45], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:336
+; GFX6-NEXT: v_mov_b32_e32 v8, s10
+; GFX6-NEXT: v_mov_b32_e32 v9, s11
+; GFX6-NEXT: s_lshr_b32 s10, s4, 11
+; GFX6-NEXT: v_mov_b32_e32 v10, s12
+; GFX6-NEXT: v_mov_b32_e32 v11, s13
+; GFX6-NEXT: s_lshr_b32 s12, s4, 8
+; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[38:39], s[46:47], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:272
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v10, s42
-; GFX6-NEXT: v_mov_b32_e32 v11, s43
-; GFX6-NEXT: s_lshr_b32 s42, s4, 12
-; GFX6-NEXT: v_mov_b32_e32 v12, s54
-; GFX6-NEXT: v_mov_b32_e32 v13, s55
-; GFX6-NEXT: s_lshr_b32 s44, s4, 13
+; GFX6-NEXT: v_mov_b32_e32 v12, s38
+; GFX6-NEXT: v_mov_b32_e32 v13, s39
+; GFX6-NEXT: s_lshr_b32 s38, s4, 9
+; GFX6-NEXT: v_mov_b32_e32 v14, s8
+; GFX6-NEXT: v_mov_b32_e32 v15, s9
+; GFX6-NEXT: s_lshr_b32 s8, s4, 6
+; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:320
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v14, s40
-; GFX6-NEXT: v_mov_b32_e32 v15, s41
-; GFX6-NEXT: s_lshr_b32 s40, s4, 10
+; GFX6-NEXT: v_mov_b32_e32 v0, s40
+; GFX6-NEXT: v_mov_b32_e32 v1, s41
+; GFX6-NEXT: s_lshr_b32 s40, s4, 7
+; GFX6-NEXT: v_mov_b32_e32 v2, s44
+; GFX6-NEXT: v_mov_b32_e32 v3, s45
+; GFX6-NEXT: s_lshr_b32 s44, s4, 4
+; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:240
+; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v16, s36
; GFX6-NEXT: v_mov_b32_e32 v17, s37
-; GFX6-NEXT: s_lshr_b32 s36, s4, 11
-; GFX6-NEXT: v_mov_b32_e32 v2, s38
-; GFX6-NEXT: v_mov_b32_e32 v3, s39
-; GFX6-NEXT: s_lshr_b32 s38, s4, 8
-; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
+; GFX6-NEXT: s_lshr_b32 s36, s4, 5
+; GFX6-NEXT: v_mov_b32_e32 v18, s42
+; GFX6-NEXT: v_mov_b32_e32 v19, s43
+; GFX6-NEXT: s_lshr_b32 s42, s4, 2
; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:304
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, s30
-; GFX6-NEXT: v_mov_b32_e32 v7, s31
-; GFX6-NEXT: s_lshr_b32 s30, s4, 9
-; GFX6-NEXT: v_mov_b32_e32 v8, s34
-; GFX6-NEXT: v_mov_b32_e32 v9, s35
-; GFX6-NEXT: s_lshr_b32 s34, s4, 6
-; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:288
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v10, s26
-; GFX6-NEXT: v_mov_b32_e32 v11, s27
-; GFX6-NEXT: s_lshr_b32 s26, s4, 7
-; GFX6-NEXT: v_mov_b32_e32 v12, s28
-; GFX6-NEXT: v_mov_b32_e32 v13, s29
-; GFX6-NEXT: s_lshr_b32 s28, s4, 4
-; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:272
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:224
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v14, s22
-; GFX6-NEXT: v_mov_b32_e32 v15, s23
-; GFX6-NEXT: s_lshr_b32 s22, s4, 5
-; GFX6-NEXT: v_mov_b32_e32 v16, s24
-; GFX6-NEXT: v_mov_b32_e32 v17, s25
-; GFX6-NEXT: s_lshr_b32 s24, s4, 2
-; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s20
-; GFX6-NEXT: v_mov_b32_e32 v1, s21
-; GFX6-NEXT: s_lshr_b32 s20, s4, 3
+; GFX6-NEXT: v_mov_b32_e32 v8, s30
+; GFX6-NEXT: v_mov_b32_e32 v9, s31
+; GFX6-NEXT: s_lshr_b32 s30, s4, 3
; GFX6-NEXT: s_lshr_b32 s4, s4, 1
; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:240
-; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:224
-; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:208
-; GFX6-NEXT: v_mov_b32_e32 v2, s18
-; GFX6-NEXT: v_mov_b32_e32 v3, s19
+; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NEXT: v_mov_b32_e32 v1, s15
-; GFX6-NEXT: v_mov_b32_e32 v2, s16
-; GFX6-NEXT: v_mov_b32_e32 v3, s17
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s10
-; GFX6-NEXT: v_mov_b32_e32 v1, s11
-; GFX6-NEXT: v_mov_b32_e32 v2, s12
-; GFX6-NEXT: v_mov_b32_e32 v3, s13
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s6
-; GFX6-NEXT: v_mov_b32_e32 v1, s7
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NEXT: v_mov_b32_e32 v3, s9
+; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
+; GFX6-NEXT: v_mov_b32_e32 v10, s6
+; GFX6-NEXT: v_mov_b32_e32 v11, s7
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160
+; GFX6-NEXT: s_waitcnt expcnt(2)
+; GFX6-NEXT: v_mov_b32_e32 v0, s34
+; GFX6-NEXT: v_mov_b32_e32 v1, s35
+; GFX6-NEXT: v_mov_b32_e32 v2, s26
+; GFX6-NEXT: v_mov_b32_e32 v3, s27
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s50
-; GFX6-NEXT: v_mov_b32_e32 v1, s51
-; GFX6-NEXT: v_mov_b32_e32 v2, s52
-; GFX6-NEXT: v_mov_b32_e32 v3, s53
+; GFX6-NEXT: v_mov_b32_e32 v0, s28
+; GFX6-NEXT: v_mov_b32_e32 v1, s29
+; GFX6-NEXT: v_mov_b32_e32 v2, s22
+; GFX6-NEXT: v_mov_b32_e32 v3, s23
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s46
-; GFX6-NEXT: v_mov_b32_e32 v1, s47
-; GFX6-NEXT: v_mov_b32_e32 v2, s48
-; GFX6-NEXT: v_mov_b32_e32 v3, s49
+; GFX6-NEXT: v_mov_b32_e32 v0, s24
+; GFX6-NEXT: v_mov_b32_e32 v1, s25
+; GFX6-NEXT: v_mov_b32_e32 v2, s18
+; GFX6-NEXT: v_mov_b32_e32 v3, s19
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s42
-; GFX6-NEXT: v_mov_b32_e32 v1, s43
-; GFX6-NEXT: v_mov_b32_e32 v2, s44
-; GFX6-NEXT: v_mov_b32_e32 v3, s45
+; GFX6-NEXT: v_mov_b32_e32 v0, s20
+; GFX6-NEXT: v_mov_b32_e32 v1, s21
+; GFX6-NEXT: v_mov_b32_e32 v2, s14
+; GFX6-NEXT: v_mov_b32_e32 v3, s15
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s40
-; GFX6-NEXT: v_mov_b32_e32 v1, s41
-; GFX6-NEXT: v_mov_b32_e32 v2, s36
-; GFX6-NEXT: v_mov_b32_e32 v3, s37
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: v_mov_b32_e32 v1, s17
+; GFX6-NEXT: v_mov_b32_e32 v2, s10
+; GFX6-NEXT: v_mov_b32_e32 v3, s11
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s38
-; GFX6-NEXT: v_mov_b32_e32 v1, s39
-; GFX6-NEXT: v_mov_b32_e32 v2, s30
-; GFX6-NEXT: v_mov_b32_e32 v3, s31
+; GFX6-NEXT: v_mov_b32_e32 v0, s12
+; GFX6-NEXT: v_mov_b32_e32 v1, s13
+; GFX6-NEXT: v_mov_b32_e32 v2, s38
+; GFX6-NEXT: v_mov_b32_e32 v3, s39
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s34
-; GFX6-NEXT: v_mov_b32_e32 v1, s35
-; GFX6-NEXT: v_mov_b32_e32 v2, s26
-; GFX6-NEXT: v_mov_b32_e32 v3, s27
+; GFX6-NEXT: v_mov_b32_e32 v0, s8
+; GFX6-NEXT: v_mov_b32_e32 v1, s9
+; GFX6-NEXT: v_mov_b32_e32 v2, s40
+; GFX6-NEXT: v_mov_b32_e32 v3, s41
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s28
-; GFX6-NEXT: v_mov_b32_e32 v1, s29
-; GFX6-NEXT: v_mov_b32_e32 v2, s22
-; GFX6-NEXT: v_mov_b32_e32 v3, s23
+; GFX6-NEXT: v_mov_b32_e32 v0, s44
+; GFX6-NEXT: v_mov_b32_e32 v1, s45
+; GFX6-NEXT: v_mov_b32_e32 v2, s36
+; GFX6-NEXT: v_mov_b32_e32 v3, s37
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s24
-; GFX6-NEXT: v_mov_b32_e32 v1, s25
-; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: v_mov_b32_e32 v3, s21
+; GFX6-NEXT: v_mov_b32_e32 v0, s42
+; GFX6-NEXT: v_mov_b32_e32 v1, s43
+; GFX6-NEXT: v_mov_b32_e32 v2, s30
+; GFX6-NEXT: v_mov_b32_e32 v3, s31
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GFX6-NEXT: v_mov_b32_e32 v6, s4
-; GFX6-NEXT: v_mov_b32_e32 v7, s5
-; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; GFX6-NEXT: v_mov_b32_e32 v8, s4
+; GFX6-NEXT: v_mov_b32_e32 v9, s5
+; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: constant_sextload_v64i1_to_v64i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
-; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; GFX8-NEXT: s_mov_b32 s90, -1
-; GFX8-NEXT: s_mov_b32 s91, 0xe80000
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
-; GFX8-NEXT: s_add_u32 s88, s88, s11
-; GFX8-NEXT: s_addc_u32 s89, s89, 0
+; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX8-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[10:11], 0x0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshr_b32 s0, s3, 8
+; GFX8-NEXT: s_lshr_b32 s48, s3, 15
; GFX8-NEXT: v_writelane_b32 v62, s0, 0
-; GFX8-NEXT: v_writelane_b32 v62, s1, 1
-; GFX8-NEXT: s_lshr_b32 s0, s2, 1
-; GFX8-NEXT: s_lshr_b32 s36, s3, 21
-; GFX8-NEXT: s_lshr_b32 s30, s3, 19
-; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000
; GFX8-NEXT: s_lshr_b32 s74, s3, 30
-; GFX8-NEXT: s_lshr_b32 s50, s3, 31
+; GFX8-NEXT: s_lshr_b32 s30, s3, 31
; GFX8-NEXT: s_lshr_b32 s72, s3, 28
-; GFX8-NEXT: s_lshr_b32 s48, s3, 29
+; GFX8-NEXT: s_lshr_b32 s34, s3, 29
; GFX8-NEXT: s_lshr_b32 s70, s3, 26
-; GFX8-NEXT: s_lshr_b32 s46, s3, 27
+; GFX8-NEXT: s_lshr_b32 s36, s3, 27
; GFX8-NEXT: s_lshr_b32 s68, s3, 24
-; GFX8-NEXT: s_lshr_b32 s42, s3, 25
-; GFX8-NEXT: s_lshr_b32 s66, s3, 22
+; GFX8-NEXT: s_lshr_b32 s38, s3, 25
+; GFX8-NEXT: s_lshr_b32 s64, s3, 22
; GFX8-NEXT: s_lshr_b32 s40, s3, 23
-; GFX8-NEXT: s_lshr_b32 s64, s3, 20
-; GFX8-NEXT: s_lshr_b32 s62, s3, 18
+; GFX8-NEXT: s_lshr_b32 s60, s3, 20
+; GFX8-NEXT: s_lshr_b32 s42, s3, 21
+; GFX8-NEXT: s_lshr_b32 s66, s3, 18
+; GFX8-NEXT: s_lshr_b32 s44, s3, 19
; GFX8-NEXT: s_lshr_b32 s56, s3, 16
-; GFX8-NEXT: s_lshr_b32 s18, s3, 17
+; GFX8-NEXT: s_lshr_b32 s46, s3, 17
; GFX8-NEXT: s_lshr_b32 s58, s3, 14
-; GFX8-NEXT: s_lshr_b32 s38, s3, 15
-; GFX8-NEXT: s_lshr_b32 s60, s3, 12
-; GFX8-NEXT: s_lshr_b32 s44, s3, 13
+; GFX8-NEXT: s_lshr_b32 s62, s3, 12
; GFX8-NEXT: s_lshr_b32 s54, s3, 10
-; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
-; GFX8-NEXT: v_writelane_b32 v62, s0, 2
+; GFX8-NEXT: v_writelane_b32 v62, s1, 1
+; GFX8-NEXT: s_lshr_b32 s0, s3, 9
+; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000
; GFX8-NEXT: s_lshr_b32 s52, s3, 11
-; GFX8-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000
+; GFX8-NEXT: v_writelane_b32 v62, s0, 2
+; GFX8-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000
; GFX8-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000
; GFX8-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000
; GFX8-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000
; GFX8-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000
; GFX8-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000
; GFX8-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000
; GFX8-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
; GFX8-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
; GFX8-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
-; GFX8-NEXT: v_mov_b32_e32 v18, s36
-; GFX8-NEXT: v_mov_b32_e32 v19, s37
-; GFX8-NEXT: v_mov_b32_e32 v26, s30
-; GFX8-NEXT: v_mov_b32_e32 v27, s31
-; GFX8-NEXT: s_bfe_i64 s[30:31], s[44:45], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[36:37], s[38:39], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX8-NEXT: v_mov_b32_e32 v34, s48
+; GFX8-NEXT: s_lshr_b32 s48, s2, 1
+; GFX8-NEXT: s_lshr_b32 s50, s3, 13
; GFX8-NEXT: v_writelane_b32 v62, s1, 3
-; GFX8-NEXT: s_lshr_b32 s6, s3, 9
-; GFX8-NEXT: s_lshr_b32 s8, s3, 6
+; GFX8-NEXT: s_lshr_b32 s6, s3, 6
; GFX8-NEXT: s_lshr_b32 s10, s3, 7
; GFX8-NEXT: s_lshr_b32 s12, s3, 4
; GFX8-NEXT: s_lshr_b32 s14, s3, 5
; GFX8-NEXT: s_lshr_b32 s16, s3, 2
-; GFX8-NEXT: s_lshr_b32 s20, s3, 3
-; GFX8-NEXT: s_lshr_b32 s22, s3, 1
-; GFX8-NEXT: s_mov_b32 s24, s3
-; GFX8-NEXT: s_lshr_b32 s26, s2, 30
-; GFX8-NEXT: s_lshr_b32 s28, s2, 31
-; GFX8-NEXT: s_lshr_b32 s34, s2, 28
+; GFX8-NEXT: s_lshr_b32 s18, s3, 3
+; GFX8-NEXT: s_lshr_b32 s20, s3, 1
+; GFX8-NEXT: s_mov_b32 s22, s3
+; GFX8-NEXT: s_lshr_b32 s24, s2, 30
+; GFX8-NEXT: s_lshr_b32 s26, s2, 31
+; GFX8-NEXT: s_lshr_b32 s28, s2, 28
; GFX8-NEXT: v_mov_b32_e32 v4, s74
-; GFX8-NEXT: v_mov_b32_e32 v8, s72
+; GFX8-NEXT: v_mov_b32_e32 v12, s72
; GFX8-NEXT: v_mov_b32_e32 v0, s70
-; GFX8-NEXT: v_mov_b32_e32 v54, s68
-; GFX8-NEXT: v_mov_b32_e32 v20, s66
+; GFX8-NEXT: v_mov_b32_e32 v8, s68
; GFX8-NEXT: v_mov_b32_e32 v16, s64
-; GFX8-NEXT: v_mov_b32_e32 v24, s62
+; GFX8-NEXT: v_mov_b32_e32 v20, s60
+; GFX8-NEXT: v_mov_b32_e32 v24, s66
; GFX8-NEXT: v_mov_b32_e32 v28, s56
; GFX8-NEXT: v_mov_b32_e32 v32, s58
-; GFX8-NEXT: v_mov_b32_e32 v36, s60
+; GFX8-NEXT: v_mov_b32_e32 v36, s62
; GFX8-NEXT: s_lshr_b32 s86, s2, 29
; GFX8-NEXT: v_mov_b32_e32 v40, s54
; GFX8-NEXT: s_lshr_b32 s84, s2, 26
; GFX8-NEXT: s_lshr_b32 s82, s2, 27
+; GFX8-NEXT: s_bfe_i64 vcc, s[52:53], 0x10000
; GFX8-NEXT: s_lshr_b32 s80, s2, 24
-; GFX8-NEXT: v_mov_b32_e32 v6, s50
+; GFX8-NEXT: v_mov_b32_e32 v6, s30
+; GFX8-NEXT: v_mov_b32_e32 v7, s31
; GFX8-NEXT: s_lshr_b32 s78, s2, 25
; GFX8-NEXT: s_lshr_b32 s76, s2, 22
-; GFX8-NEXT: v_mov_b32_e32 v10, s48
+; GFX8-NEXT: v_mov_b32_e32 v14, s34
; GFX8-NEXT: s_lshr_b32 s74, s2, 23
; GFX8-NEXT: s_lshr_b32 s72, s2, 20
-; GFX8-NEXT: v_mov_b32_e32 v2, s46
+; GFX8-NEXT: v_mov_b32_e32 v2, s36
; GFX8-NEXT: s_lshr_b32 s70, s2, 21
; GFX8-NEXT: s_lshr_b32 s68, s2, 18
-; GFX8-NEXT: v_mov_b32_e32 v56, s42
+; GFX8-NEXT: v_mov_b32_e32 v10, s38
; GFX8-NEXT: s_lshr_b32 s66, s2, 19
; GFX8-NEXT: s_lshr_b32 s64, s2, 16
-; GFX8-NEXT: v_mov_b32_e32 v22, s40
+; GFX8-NEXT: v_mov_b32_e32 v18, s40
; GFX8-NEXT: s_lshr_b32 s62, s2, 17
; GFX8-NEXT: s_lshr_b32 s60, s2, 14
+; GFX8-NEXT: v_mov_b32_e32 v22, s42
; GFX8-NEXT: s_lshr_b32 s58, s2, 15
; GFX8-NEXT: s_lshr_b32 s56, s2, 12
+; GFX8-NEXT: v_mov_b32_e32 v26, s44
; GFX8-NEXT: s_lshr_b32 s54, s2, 13
-; GFX8-NEXT: s_bfe_i64 vcc, s[52:53], 0x10000
; GFX8-NEXT: s_lshr_b32 s52, s2, 10
-; GFX8-NEXT: v_mov_b32_e32 v30, s18
-; GFX8-NEXT: v_mov_b32_e32 v31, s19
-; GFX8-NEXT: s_lshr_b32 s50, s2, 11
-; GFX8-NEXT: s_lshr_b32 s48, s2, 8
-; GFX8-NEXT: v_mov_b32_e32 v34, s36
+; GFX8-NEXT: v_mov_b32_e32 v30, s46
+; GFX8-NEXT: s_lshr_b32 s4, s2, 11
+; GFX8-NEXT: s_lshr_b32 s0, s2, 8
; GFX8-NEXT: s_lshr_b32 s46, s2, 9
; GFX8-NEXT: s_lshr_b32 s44, s2, 6
-; GFX8-NEXT: v_mov_b32_e32 v38, s30
; GFX8-NEXT: s_lshr_b32 s42, s2, 7
; GFX8-NEXT: s_lshr_b32 s40, s2, 4
; GFX8-NEXT: s_lshr_b32 s38, s2, 5
; GFX8-NEXT: s_lshr_b32 s36, s2, 2
-; GFX8-NEXT: s_lshr_b32 s30, s2, 3
-; GFX8-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x10000
+; GFX8-NEXT: s_lshr_b32 s34, s2, 3
+; GFX8-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[2:3], s[48:49], 0x10000
+; GFX8-NEXT: v_writelane_b32 v62, s2, 4
+; GFX8-NEXT: v_writelane_b32 v62, s3, 5
+; GFX8-NEXT: v_readlane_b32 s2, v62, 2
+; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000
+; GFX8-NEXT: v_readlane_b32 s3, v62, 3
+; GFX8-NEXT: v_mov_b32_e32 v38, s50
+; GFX8-NEXT: v_mov_b32_e32 v39, s51
+; GFX8-NEXT: s_bfe_i64 s[50:51], s[4:5], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[4:5], s[6:7], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[6:7], s[2:3], 0x10000
; GFX8-NEXT: v_readlane_b32 s2, v62, 0
; GFX8-NEXT: v_readlane_b32 s3, v62, 1
; GFX8-NEXT: v_mov_b32_e32 v5, s75
-; GFX8-NEXT: v_mov_b32_e32 v7, s51
-; GFX8-NEXT: v_mov_b32_e32 v9, s73
-; GFX8-NEXT: v_mov_b32_e32 v11, s49
+; GFX8-NEXT: v_mov_b32_e32 v13, s73
+; GFX8-NEXT: v_mov_b32_e32 v15, s35
; GFX8-NEXT: v_mov_b32_e32 v1, s71
-; GFX8-NEXT: v_mov_b32_e32 v3, s47
-; GFX8-NEXT: v_mov_b32_e32 v55, s69
-; GFX8-NEXT: v_mov_b32_e32 v57, s43
-; GFX8-NEXT: v_mov_b32_e32 v21, s67
-; GFX8-NEXT: v_mov_b32_e32 v23, s41
+; GFX8-NEXT: v_mov_b32_e32 v3, s37
+; GFX8-NEXT: v_mov_b32_e32 v9, s69
+; GFX8-NEXT: v_mov_b32_e32 v11, s39
; GFX8-NEXT: v_mov_b32_e32 v17, s65
-; GFX8-NEXT: v_mov_b32_e32 v25, s63
+; GFX8-NEXT: v_mov_b32_e32 v19, s41
+; GFX8-NEXT: v_mov_b32_e32 v21, s61
+; GFX8-NEXT: v_mov_b32_e32 v23, s43
+; GFX8-NEXT: v_mov_b32_e32 v25, s67
+; GFX8-NEXT: v_mov_b32_e32 v27, s45
; GFX8-NEXT: v_mov_b32_e32 v29, s57
+; GFX8-NEXT: v_mov_b32_e32 v31, s47
; GFX8-NEXT: v_mov_b32_e32 v33, s59
-; GFX8-NEXT: v_mov_b32_e32 v35, s37
-; GFX8-NEXT: v_mov_b32_e32 v37, s61
-; GFX8-NEXT: v_mov_b32_e32 v39, s31
+; GFX8-NEXT: v_mov_b32_e32 v35, s49
+; GFX8-NEXT: v_mov_b32_e32 v37, s63
; GFX8-NEXT: v_mov_b32_e32 v41, s55
-; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
; GFX8-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
; GFX8-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
; GFX8-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
; GFX8-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
; GFX8-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[48:49], s[0:1], 0x10000
; GFX8-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000
; GFX8-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000
; GFX8-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000
@@ -8837,269 +8836,262 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: s_bfe_i64 s[82:83], s[82:83], 0x10000
; GFX8-NEXT: s_bfe_i64 s[84:85], s[84:85], 0x10000
; GFX8-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
; GFX8-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
; GFX8-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
; GFX8-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
; GFX8-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
; GFX8-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
; GFX8-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
; GFX8-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
; GFX8-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[0:1], s[6:7], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[6:7], s[2:3], 0x10000
-; GFX8-NEXT: s_add_u32 s2, s4, 0x1f0
-; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: s_bfe_i64 s[0:1], s[10:11], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[10:11], s[2:3], 0x10000
+; GFX8-NEXT: s_add_u32 s2, s8, 0x1f0
+; GFX8-NEXT: s_addc_u32 s3, s9, 0
; GFX8-NEXT: v_mov_b32_e32 v43, s3
; GFX8-NEXT: v_mov_b32_e32 v42, s2
-; GFX8-NEXT: s_add_u32 s2, s4, 0x1e0
-; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: s_add_u32 s2, s8, 0x1e0
+; GFX8-NEXT: s_addc_u32 s3, s9, 0
; GFX8-NEXT: v_mov_b32_e32 v45, s3
; GFX8-NEXT: v_mov_b32_e32 v44, s2
-; GFX8-NEXT: s_add_u32 s2, s4, 0x1d0
-; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: s_add_u32 s2, s8, 0x1d0
+; GFX8-NEXT: s_addc_u32 s3, s9, 0
; GFX8-NEXT: v_mov_b32_e32 v47, s3
; GFX8-NEXT: v_mov_b32_e32 v46, s2
-; GFX8-NEXT: s_add_u32 s2, s4, 0x1c0
-; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: s_add_u32 s2, s8, 0x1c0
+; GFX8-NEXT: s_addc_u32 s3, s9, 0
; GFX8-NEXT: v_mov_b32_e32 v49, s3
; GFX8-NEXT: v_mov_b32_e32 v48, s2
-; GFX8-NEXT: s_add_u32 s2, s4, 0x1b0
-; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: s_add_u32 s2, s8, 0x1b0
+; GFX8-NEXT: s_addc_u32 s3, s9, 0
; GFX8-NEXT: v_mov_b32_e32 v51, s3
; GFX8-NEXT: v_mov_b32_e32 v50, s2
-; GFX8-NEXT: s_add_u32 s2, s4, 0x1a0
-; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: s_add_u32 s2, s8, 0x1a0
+; GFX8-NEXT: s_addc_u32 s3, s9, 0
; GFX8-NEXT: v_mov_b32_e32 v53, s3
; GFX8-NEXT: v_mov_b32_e32 v52, s2
-; GFX8-NEXT: s_add_u32 s2, s4, 0x190
-; GFX8-NEXT: s_addc_u32 s3, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v15, s3
-; GFX8-NEXT: v_mov_b32_e32 v14, s2
-; GFX8-NEXT: s_add_u32 s2, s4, 0x180
-; GFX8-NEXT: s_addc_u32 s3, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v13, s3
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
-; GFX8-NEXT: buffer_store_dword v12, off, s[88:91], 0 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v13, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
-; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7]
-; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[8:11]
-; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3]
-; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[54:57]
-; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[20:23]
-; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[16:19]
-; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[24:27]
-; GFX8-NEXT: buffer_load_dword v18, off, s[88:91], 0 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v19, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
-; GFX8-NEXT: s_add_u32 s2, s4, 0x170
-; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: s_add_u32 s2, s8, 0x190
+; GFX8-NEXT: s_addc_u32 s3, s9, 0
+; GFX8-NEXT: v_mov_b32_e32 v55, s3
+; GFX8-NEXT: v_mov_b32_e32 v54, s2
+; GFX8-NEXT: s_add_u32 s2, s8, 0x180
+; GFX8-NEXT: s_addc_u32 s3, s9, 0
+; GFX8-NEXT: v_mov_b32_e32 v57, s3
+; GFX8-NEXT: v_mov_b32_e32 v56, s2
+; GFX8-NEXT: s_add_u32 s2, s8, 0x170
+; GFX8-NEXT: s_addc_u32 s3, s9, 0
; GFX8-NEXT: v_mov_b32_e32 v59, s3
; GFX8-NEXT: v_mov_b32_e32 v58, s2
-; GFX8-NEXT: s_add_u32 s2, s4, 0x160
-; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: s_add_u32 s2, s8, 0x160
+; GFX8-NEXT: s_addc_u32 s3, s9, 0
; GFX8-NEXT: v_mov_b32_e32 v61, s3
; GFX8-NEXT: v_mov_b32_e32 v60, s2
-; GFX8-NEXT: s_add_u32 s2, s4, 0x150
-; GFX8-NEXT: s_addc_u32 s3, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v45, s3
-; GFX8-NEXT: v_mov_b32_e32 v44, s2
-; GFX8-NEXT: s_add_u32 s2, s4, 0x140
-; GFX8-NEXT: s_addc_u32 s3, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s0
-; GFX8-NEXT: s_add_u32 s0, s4, 0x130
-; GFX8-NEXT: v_mov_b32_e32 v7, s1
-; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: s_add_u32 s2, s8, 0x150
+; GFX8-NEXT: s_addc_u32 s3, s9, 0
+; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[12:15]
+; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v13, s3
+; GFX8-NEXT: v_mov_b32_e32 v12, s2
+; GFX8-NEXT: s_add_u32 s2, s8, 0x140
+; GFX8-NEXT: s_addc_u32 s3, s9, 0
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s8, 0x130
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: s_addc_u32 s1, s9, 0
+; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7]
+; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11]
+; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19]
+; GFX8-NEXT: v_mov_b32_e32 v4, s10
; GFX8-NEXT: v_mov_b32_e32 v17, s1
; GFX8-NEXT: v_mov_b32_e32 v16, s0
-; GFX8-NEXT: s_add_u32 s0, s4, 0x120
-; GFX8-NEXT: s_addc_u32 s1, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v15, s1
-; GFX8-NEXT: v_mov_b32_e32 v14, s0
-; GFX8-NEXT: s_add_u32 s0, s4, 0x110
-; GFX8-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NEXT: v_mov_b32_e32 v13, s3
-; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: s_add_u32 s0, s8, 0x120
+; GFX8-NEXT: s_addc_u32 s1, s9, 0
+; GFX8-NEXT: v_mov_b32_e32 v19, s1
+; GFX8-NEXT: v_mov_b32_e32 v18, s0
+; GFX8-NEXT: s_add_u32 s0, s8, 0x110
+; GFX8-NEXT: v_mov_b32_e32 v5, s11
+; GFX8-NEXT: v_mov_b32_e32 v15, s3
+; GFX8-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NEXT: v_mov_b32_e32 v42, vcc_lo
; GFX8-NEXT: v_mov_b32_e32 v43, vcc_hi
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s8
-; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_mov_b32_e32 v14, s2
+; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mov_b32_e32 v7, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v8, s12
-; GFX8-NEXT: v_mov_b32_e32 v2, s10
-; GFX8-NEXT: v_mov_b32_e32 v3, s11
+; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23]
; GFX8-NEXT: v_mov_b32_e32 v9, s13
+; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27]
; GFX8-NEXT: v_mov_b32_e32 v10, s14
; GFX8-NEXT: v_mov_b32_e32 v11, s15
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[28:31]
+; GFX8-NEXT: flat_store_dwordx4 v[56:57], v[28:31]
; GFX8-NEXT: flat_store_dwordx4 v[58:59], v[32:35]
; GFX8-NEXT: flat_store_dwordx4 v[60:61], v[36:39]
-; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[40:43]
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
+; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[40:43]
+; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
-; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
+; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[8:11]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: s_add_u32 s0, s4, 0x100
+; GFX8-NEXT: s_add_u32 s0, s8, 0x100
; GFX8-NEXT: v_mov_b32_e32 v0, s16
; GFX8-NEXT: v_mov_b32_e32 v1, s17
+; GFX8-NEXT: v_mov_b32_e32 v2, s18
+; GFX8-NEXT: v_mov_b32_e32 v3, s19
+; GFX8-NEXT: s_addc_u32 s1, s9, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: s_add_u32 s0, s8, 0xf0
+; GFX8-NEXT: v_mov_b32_e32 v0, s22
+; GFX8-NEXT: v_mov_b32_e32 v1, s23
; GFX8-NEXT: v_mov_b32_e32 v2, s20
; GFX8-NEXT: v_mov_b32_e32 v3, s21
-; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: s_add_u32 s0, s4, 0xf0
+; GFX8-NEXT: s_add_u32 s0, s8, 0xe0
; GFX8-NEXT: v_mov_b32_e32 v0, s24
; GFX8-NEXT: v_mov_b32_e32 v1, s25
-; GFX8-NEXT: v_mov_b32_e32 v2, s22
-; GFX8-NEXT: v_mov_b32_e32 v3, s23
-; GFX8-NEXT: s_addc_u32 s1, s5, 0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: s_add_u32 s0, s4, 0xe0
-; GFX8-NEXT: v_mov_b32_e32 v0, s26
-; GFX8-NEXT: v_mov_b32_e32 v1, s27
-; GFX8-NEXT: v_mov_b32_e32 v2, s28
-; GFX8-NEXT: v_mov_b32_e32 v3, s29
-; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v2, s26
+; GFX8-NEXT: v_mov_b32_e32 v3, s27
+; GFX8-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: s_add_u32 s0, s4, 0xd0
-; GFX8-NEXT: v_mov_b32_e32 v0, s34
-; GFX8-NEXT: v_mov_b32_e32 v1, s35
+; GFX8-NEXT: s_add_u32 s0, s8, 0xd0
+; GFX8-NEXT: v_mov_b32_e32 v0, s28
+; GFX8-NEXT: v_mov_b32_e32 v1, s29
; GFX8-NEXT: v_mov_b32_e32 v2, s86
; GFX8-NEXT: v_mov_b32_e32 v3, s87
-; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: s_add_u32 s0, s4, 0xc0
+; GFX8-NEXT: s_add_u32 s0, s8, 0xc0
; GFX8-NEXT: v_mov_b32_e32 v0, s84
; GFX8-NEXT: v_mov_b32_e32 v1, s85
; GFX8-NEXT: v_mov_b32_e32 v2, s82
; GFX8-NEXT: v_mov_b32_e32 v3, s83
-; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: s_add_u32 s0, s4, 0xb0
+; GFX8-NEXT: s_add_u32 s0, s8, 0xb0
; GFX8-NEXT: v_mov_b32_e32 v0, s80
; GFX8-NEXT: v_mov_b32_e32 v1, s81
; GFX8-NEXT: v_mov_b32_e32 v2, s78
; GFX8-NEXT: v_mov_b32_e32 v3, s79
-; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: s_add_u32 s0, s4, 0xa0
+; GFX8-NEXT: s_add_u32 s0, s8, 0xa0
; GFX8-NEXT: v_mov_b32_e32 v0, s76
; GFX8-NEXT: v_mov_b32_e32 v1, s77
; GFX8-NEXT: v_mov_b32_e32 v2, s74
; GFX8-NEXT: v_mov_b32_e32 v3, s75
-; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: s_add_u32 s0, s4, 0x90
+; GFX8-NEXT: s_add_u32 s0, s8, 0x90
; GFX8-NEXT: v_mov_b32_e32 v0, s72
; GFX8-NEXT: v_mov_b32_e32 v1, s73
; GFX8-NEXT: v_mov_b32_e32 v2, s70
; GFX8-NEXT: v_mov_b32_e32 v3, s71
-; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: s_add_u32 s0, s4, 0x80
+; GFX8-NEXT: s_add_u32 s0, s8, 0x80
; GFX8-NEXT: v_mov_b32_e32 v0, s68
; GFX8-NEXT: v_mov_b32_e32 v1, s69
; GFX8-NEXT: v_mov_b32_e32 v2, s66
; GFX8-NEXT: v_mov_b32_e32 v3, s67
-; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: s_add_u32 s0, s4, 0x70
+; GFX8-NEXT: s_add_u32 s0, s8, 0x70
; GFX8-NEXT: v_mov_b32_e32 v0, s64
; GFX8-NEXT: v_mov_b32_e32 v1, s65
; GFX8-NEXT: v_mov_b32_e32 v2, s62
; GFX8-NEXT: v_mov_b32_e32 v3, s63
-; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: s_add_u32 s0, s4, 0x60
+; GFX8-NEXT: s_add_u32 s0, s8, 0x60
; GFX8-NEXT: v_mov_b32_e32 v0, s60
; GFX8-NEXT: v_mov_b32_e32 v1, s61
; GFX8-NEXT: v_mov_b32_e32 v2, s58
; GFX8-NEXT: v_mov_b32_e32 v3, s59
-; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: s_add_u32 s0, s4, 0x50
+; GFX8-NEXT: s_add_u32 s0, s8, 0x50
; GFX8-NEXT: v_mov_b32_e32 v0, s56
; GFX8-NEXT: v_mov_b32_e32 v1, s57
; GFX8-NEXT: v_mov_b32_e32 v2, s54
; GFX8-NEXT: v_mov_b32_e32 v3, s55
-; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: s_add_u32 s0, s4, 64
+; GFX8-NEXT: s_add_u32 s0, s8, 64
; GFX8-NEXT: v_mov_b32_e32 v0, s52
; GFX8-NEXT: v_mov_b32_e32 v1, s53
; GFX8-NEXT: v_mov_b32_e32 v2, s50
; GFX8-NEXT: v_mov_b32_e32 v3, s51
-; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NEXT: s_add_u32 s0, s8, 48
; GFX8-NEXT: v_mov_b32_e32 v0, s48
; GFX8-NEXT: v_mov_b32_e32 v1, s49
; GFX8-NEXT: v_mov_b32_e32 v2, s46
; GFX8-NEXT: v_mov_b32_e32 v3, s47
-; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NEXT: s_add_u32 s0, s8, 32
; GFX8-NEXT: v_mov_b32_e32 v0, s44
; GFX8-NEXT: v_mov_b32_e32 v1, s45
; GFX8-NEXT: v_mov_b32_e32 v2, s42
; GFX8-NEXT: v_mov_b32_e32 v3, s43
-; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_add_u32 s0, s8, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s40
; GFX8-NEXT: v_mov_b32_e32 v1, s41
; GFX8-NEXT: v_mov_b32_e32 v2, s38
; GFX8-NEXT: v_mov_b32_e32 v3, s39
-; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s36
; GFX8-NEXT: v_mov_b32_e32 v1, s37
-; GFX8-NEXT: v_mov_b32_e32 v2, s30
-; GFX8-NEXT: v_mov_b32_e32 v3, s31
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
+; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_readlane_b32 s0, v62, 2
+; GFX8-NEXT: v_readlane_b32 s0, v62, 4
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: v_readlane_b32 s1, v62, 3
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v0, s18
-; GFX8-NEXT: v_mov_b32_e32 v1, s19
+; GFX8-NEXT: v_readlane_b32 s1, v62, 5
+; GFX8-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NEXT: v_mov_b32_e32 v0, s30
+; GFX8-NEXT: v_mov_b32_e32 v1, s31
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s9
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index bb98af4e7a5c7f..255a1acbe0086f 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -637,8 +637,8 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GCN-NOHSA-VI-NEXT: flat_load_ushort v19, v[6:7]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v20, v[8:9]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v21, v[10:11]
-; GCN-NOHSA-VI-NEXT: flat_load_ushort v22, v[12:13]
-; GCN-NOHSA-VI-NEXT: flat_load_ushort v23, v[14:15]
+; GCN-NOHSA-VI-NEXT: flat_load_ushort v12, v[12:13]
+; GCN-NOHSA-VI-NEXT: flat_load_ushort v13, v[14:15]
; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
@@ -664,18 +664,18 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s2
; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 2
; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s0
-; GCN-NOHSA-VI-NEXT: flat_load_ushort v0, v[0:1]
-; GCN-NOHSA-VI-NEXT: flat_load_ushort v24, v[2:3]
+; GCN-NOHSA-VI-NEXT: flat_load_ushort v14, v[0:1]
+; GCN-NOHSA-VI-NEXT: flat_load_ushort v15, v[2:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
; GCN-NOHSA-VI-NEXT: flat_load_ushort v4, v[4:5]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v5, v[6:7]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
; GCN-NOHSA-VI-NEXT: flat_load_ushort v8, v[8:9]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v9, v[10:11]
-; GCN-NOHSA-VI-NEXT: flat_load_ushort v10, v[12:13]
-; GCN-NOHSA-VI-NEXT: flat_load_ushort v11, v[14:15]
+; GCN-NOHSA-VI-NEXT: flat_load_ushort v0, v[0:1]
+; GCN-NOHSA-VI-NEXT: flat_load_ushort v10, v[2:3]
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(14)
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16
; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v3, v17, v1
@@ -688,25 +688,25 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(10)
; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v1, v21, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(9)
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v6, 16, v22
+; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v6, 16, v12
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8)
-; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v7, v23, v6
+; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v7, v13, v6
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7)
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v6, 16, v14
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(6)
-; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v6, v24, v0
+; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v6, v15, v6
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(5)
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(4)
-; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v5, v5, v0
+; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v5, v5, v4
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3)
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v8
+; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v4, 16, v8
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2)
-; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v4, v9, v0
+; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v4, v9, v4
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v10
+; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v0, v11, v0
+; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v0, v10, v0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -2502,29 +2502,27 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, -1
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s1, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s0, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s3, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s2, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s5, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s4, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s7, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s6, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s9, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s8, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s11, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s10, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s13, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s12, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s15, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s14, 16
-; GCN-NOHSA-SI-NEXT: s_and_b32 s1, s1, 0xffff
-; GCN-NOHSA-SI-NEXT: s_and_b32 s0, s0, 0xffff
-; GCN-NOHSA-SI-NEXT: s_and_b32 s3, s3, 0xffff
-; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s2, 0xffff
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s1, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s0, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s3, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s2, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s5, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s4, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s7, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s6, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s9, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s8, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s11, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s10, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s13, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s12, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s15, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s14, 16
+; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s1, 0xffff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s0, 0xffff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s3, 0xffff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s38, s2, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff
@@ -2534,56 +2532,60 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff
-; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s36
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s22
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s37
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32:
@@ -2622,32 +2624,32 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff
; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff
; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff
-; GCN-HSA-NEXT: s_and_b32 s0, s15, 0xffff
-; GCN-HSA-NEXT: s_and_b32 s1, s14, 0xffff
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
+; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff
+; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s1
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s31
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s30
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 64
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28
@@ -2981,88 +2983,90 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, -1
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s1, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s0, 16
-; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s1, s1
-; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s0, s0
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s18, s1, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s19, s0, 16
+; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s20, s1
+; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s21, s0
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s22, s3, 16
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s2, 16
-; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s3, s3
-; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s2, s2
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s24, s5, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s4, 16
+; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s24, s3
+; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s25, s2
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s26, s5, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s4, 16
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s26, s7, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s6, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s28, s7, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s6, 16
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s7, s7
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s6, s6
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s28, s9, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s8, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s30, s9, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s8, 16
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s9, s9
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s8, s8
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s30, s11, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s10, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s11, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s34, s10, 16
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s11, s11
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s10
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s13, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s34, s12, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s13, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s36, s12, 16
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s13, s13
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s15, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s36, s14, 16
+; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s12, s12
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s37, s15, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s14, 16
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s15, s15
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s14, s14
-; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s12, s12
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s36
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s38
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s36
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s25
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s22
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32:
@@ -3073,8 +3077,6 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_ashr_i32 s18, s1, 16
; GCN-HSA-NEXT: s_ashr_i32 s19, s0, 16
-; GCN-HSA-NEXT: s_sext_i32_i16 s20, s1
-; GCN-HSA-NEXT: s_sext_i32_i16 s21, s0
; GCN-HSA-NEXT: s_ashr_i32 s22, s3, 16
; GCN-HSA-NEXT: s_ashr_i32 s23, s2, 16
; GCN-HSA-NEXT: s_ashr_i32 s24, s5, 16
@@ -3087,34 +3089,36 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
; GCN-HSA-NEXT: s_ashr_i32 s31, s10, 16
; GCN-HSA-NEXT: s_ashr_i32 s33, s13, 16
; GCN-HSA-NEXT: s_ashr_i32 s34, s12, 16
-; GCN-HSA-NEXT: s_ashr_i32 s0, s15, 16
-; GCN-HSA-NEXT: s_ashr_i32 s1, s14, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0
+; GCN-HSA-NEXT: s_ashr_i32 s35, s15, 16
+; GCN-HSA-NEXT: s_ashr_i32 s36, s14, 16
+; GCN-HSA-NEXT: s_sext_i32_i16 s21, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
+; GCN-HSA-NEXT: s_sext_i32_i16 s20, s1
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15
-; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15
+; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33
+; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s36
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s34
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s33
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 64
@@ -3524,18 +3528,18 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s16, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s19, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s18, 0xffff
-; GCN-NOHSA-SI-NEXT: s_and_b32 s21, s21, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s20, s20, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s23, s23, 0xffff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s22, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s25, s25, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s24, s24, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s27, s27, 0xffff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s26, s26, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s29, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s28, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s31, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s30, 0xffff
-; GCN-NOHSA-SI-NEXT: s_and_b32 s26, s26, 0xffff
-; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s22, 0xffff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s21, s21, 0xffff
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s36
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s37
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
@@ -3555,22 +3559,21 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s24
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s64
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s25
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s63
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s62
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s61
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s22
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s62
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s23
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s61
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s60
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18
@@ -3652,10 +3655,10 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-HSA-NEXT: s_lshr_b32 s35, s8, 16
; GCN-HSA-NEXT: s_lshr_b32 s37, s11, 16
; GCN-HSA-NEXT: s_lshr_b32 s39, s10, 16
-; GCN-HSA-NEXT: s_lshr_b32 s41, s13, 16
-; GCN-HSA-NEXT: s_lshr_b32 s43, s12, 16
+; GCN-HSA-NEXT: s_lshr_b32 s42, s13, 16
+; GCN-HSA-NEXT: s_lshr_b32 s44, s12, 16
; GCN-HSA-NEXT: s_lshr_b32 s45, s15, 16
-; GCN-HSA-NEXT: s_lshr_b32 s47, s14, 16
+; GCN-HSA-NEXT: s_lshr_b32 s46, s14, 16
; GCN-HSA-NEXT: s_and_b32 s25, s1, 0xffff
; GCN-HSA-NEXT: s_and_b32 s27, s0, 0xffff
; GCN-HSA-NEXT: s_and_b32 s29, s3, 0xffff
@@ -3664,13 +3667,13 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-HSA-NEXT: s_and_b32 s36, s4, 0xffff
; GCN-HSA-NEXT: s_and_b32 s38, s7, 0xffff
; GCN-HSA-NEXT: s_and_b32 s40, s6, 0xffff
-; GCN-HSA-NEXT: s_and_b32 s42, s9, 0xffff
-; GCN-HSA-NEXT: s_and_b32 s44, s8, 0xffff
-; GCN-HSA-NEXT: s_and_b32 s46, s11, 0xffff
+; GCN-HSA-NEXT: s_and_b32 s41, s9, 0xffff
+; GCN-HSA-NEXT: s_and_b32 s43, s8, 0xffff
+; GCN-HSA-NEXT: s_and_b32 s47, s11, 0xffff
; GCN-HSA-NEXT: s_and_b32 s48, s10, 0xffff
; GCN-HSA-NEXT: s_and_b32 s49, s13, 0xffff
-; GCN-HSA-NEXT: s_and_b32 s50, s12, 0xffff
-; GCN-HSA-NEXT: s_and_b32 s51, s15, 0xffff
+; GCN-HSA-NEXT: s_and_b32 s51, s12, 0xffff
+; GCN-HSA-NEXT: s_and_b32 s50, s15, 0xffff
; GCN-HSA-NEXT: s_and_b32 s52, s14, 0xffff
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -3708,111 +3711,111 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v29, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v28, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v28, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0
-; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v33, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v32, s0
-; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s64
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s63
-; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0
-; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80
-; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v35, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v34, s0
-; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s66
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s65
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s62
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s61
+; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11]
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s0
+; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s60
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s59
+; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
+; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80
+; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s66
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s65
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0
+; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s58
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s7
+; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s57
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s68
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s67
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s56
-; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v23, s55
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s54
-; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3
-; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15]
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s53
-; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7]
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s52
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19]
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s47
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s45
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s64
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s63
+; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s43
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s49
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s41
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2
+; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7]
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s55
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s54
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s68
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s67
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18
+; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s46
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s51
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s50
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s45
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s44
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s49
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s42
+; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 64
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s48
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s39
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s46
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s47
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 48
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s43
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s42
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s41
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -3854,57 +3857,34 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0
-; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40
+; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x40
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s1, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s0, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s3, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s2, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s41, s5, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s4, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s43, s7, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s6, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s45, s9, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s8, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s47, s11, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s10, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s13, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s39, s12, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s49, s15, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s14, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s17, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s16, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s53, s19, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s18, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s55, s21, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s20, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s57, s23, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s22, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s59, s25, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s24, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s27, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s26, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s29, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s28, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s65, s31, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s30, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff
-; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff
-; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff
-; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff
-; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff
-; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff
-; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff
-; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff
-; GCN-NOHSA-VI-NEXT: s_and_b32 s67, s11, 0xffff
-; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s10, 0xffff
-; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff
-; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff
-; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff
-; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s17, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s16, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s19, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s18, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s41, s21, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s20, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s43, s23, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s22, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s45, s25, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s24, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s47, s27, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s26, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s29, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s39, s28, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s49, s31, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s30, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s1, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s0, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s53, s3, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s2, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s55, s5, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s4, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s57, s7, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s6, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s59, s9, 16
; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s17, 0xffff
; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, 0xffff
; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s19, 0xffff
@@ -3919,151 +3899,170 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s26, 0xffff
; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s29, 0xffff
; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s28, 0xffff
-; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s31, 0xffff
-; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s30, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10
-; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xf0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10
+; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s31, 0xffff
+; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s30, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s8, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s11, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s10, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s13, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s12, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s65, s15, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s14, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s67, s1, 0xffff
+; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s0, 0xffff
+; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff
+; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff
+; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff
+; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff
+; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff
+; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff
+; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff
+; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff
+; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff
+; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff
+; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff
+; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s15, 0xffff
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s14, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xf0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xe0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s66
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11
-; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xe0
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xd0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s64
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11
-; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xd0
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xc0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s62
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11
-; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xc0
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xb0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s60
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11
-; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xb0
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xa0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s58
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11
-; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xa0
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x90
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s56
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11
-; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x90
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x80
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s54
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11
-; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x80
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x70
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s52
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s67
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11
-; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x70
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x60
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s50
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11
-; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x60
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x50
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s38
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11
-; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x50
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 64
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s48
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s67
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_nop 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s36, 64
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s37, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 48
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s46
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_nop 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s36, 48
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s37, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 32
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s44
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_nop 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s36, 32
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s37, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s42
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_nop 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s36, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s37, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s40
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s36
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s37
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -4437,16 +4436,17 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s60, s4, 16
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s7, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s62, s6, 16
-; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s7, s7
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s6, 16
+; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s62, s7
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s6, s6
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s63, s8, 16
-; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s64, s9
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s63, s9, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s64, s8, 16
+; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s9, s9
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s8, s8
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s65, s11, 16
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s66, s10, 16
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s11, s11
+; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s10
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s67, s13, 16
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s68, s12, 16
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s13, s13
@@ -4455,8 +4455,7 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s70, s14, 16
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s15, s15
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s14, s14
-; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s10
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s9, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s7, 16
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s36
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s37
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
@@ -4474,24 +4473,23 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s11
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s65
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s8
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s63
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s64
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s64
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s9
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s63
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s6
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s61
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s62
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s9
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s62
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s61
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s7
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s60
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s58
@@ -4586,10 +4584,10 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-HSA-NEXT: s_sext_i32_i16 s43, s11
; GCN-HSA-NEXT: s_sext_i32_i16 s44, s10
; GCN-HSA-NEXT: s_ashr_i32 s45, s13, 16
-; GCN-HSA-NEXT: s_ashr_i32 s46, s12, 16
-; GCN-HSA-NEXT: s_sext_i32_i16 s47, s13
-; GCN-HSA-NEXT: s_sext_i32_i16 s48, s12
-; GCN-HSA-NEXT: s_ashr_i32 s49, s15, 16
+; GCN-HSA-NEXT: s_ashr_i32 s47, s12, 16
+; GCN-HSA-NEXT: s_sext_i32_i16 s46, s13
+; GCN-HSA-NEXT: s_sext_i32_i16 s49, s12
+; GCN-HSA-NEXT: s_ashr_i32 s48, s15, 16
; GCN-HSA-NEXT: s_ashr_i32 s50, s14, 16
; GCN-HSA-NEXT: s_sext_i32_i16 s51, s15
; GCN-HSA-NEXT: s_sext_i32_i16 s52, s14
@@ -4597,8 +4595,8 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_ashr_i32 s18, s1, 16
; GCN-HSA-NEXT: s_ashr_i32 s19, s0, 16
-; GCN-HSA-NEXT: s_ashr_i32 s55, s3, 16
-; GCN-HSA-NEXT: s_ashr_i32 s56, s2, 16
+; GCN-HSA-NEXT: s_ashr_i32 s53, s3, 16
+; GCN-HSA-NEXT: s_ashr_i32 s54, s2, 16
; GCN-HSA-NEXT: s_ashr_i32 s57, s5, 16
; GCN-HSA-NEXT: s_ashr_i32 s58, s4, 16
; GCN-HSA-NEXT: s_ashr_i32 s59, s7, 16
@@ -4611,114 +4609,114 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-HSA-NEXT: s_ashr_i32 s66, s12, 16
; GCN-HSA-NEXT: s_ashr_i32 s67, s15, 16
; GCN-HSA-NEXT: s_ashr_i32 s68, s14, 16
-; GCN-HSA-NEXT: s_sext_i32_i16 s54, s0
-; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0
-; GCN-HSA-NEXT: s_sext_i32_i16 s53, s1
-; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0
-; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0
-; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0
-; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0
-; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0
-; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0
-; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v29, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v28, s0
-; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0
-; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0
-; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0
-; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v33, s1
+; GCN-HSA-NEXT: s_sext_i32_i16 s56, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xf0
+; GCN-HSA-NEXT: s_sext_i32_i16 s55, s3
+; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xe0
+; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xd0
+; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xc0
+; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xb0
+; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v28, s3
+; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11
+; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xa0
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63
+; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11]
+; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3
+; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x90
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61
+; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15]
; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12
-; GCN-HSA-NEXT: v_mov_b32_e32 v32, s0
-; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x80
+; GCN-HSA-NEXT: s_sext_i32_i16 s0, s0
+; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15
+; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65
-; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0
-; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80
-; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v35, s1
+; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
+; GCN-HSA-NEXT: s_sext_i32_i16 s1, s1
; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7
; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6
-; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9
-; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8
-; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11
-; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10
-; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15
-; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14
-; GCN-HSA-NEXT: v_mov_b32_e32 v34, s0
-; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70
-; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3
-; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2
-; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5
-; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61
+; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7]
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s60
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s7
+; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s54
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s53
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s1
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s58
-; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56
-; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3
-; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15]
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s55
-; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7]
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s52
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19]
+; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s50
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s49
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1
+; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s48
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s46
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s47
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s45
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s56
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s58
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s57
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s54
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s55
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18
+; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s50
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s49
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s51
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s48
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s47
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s46
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s45
+; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 64
@@ -7033,104 +7031,102 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0
+; GCN-HSA-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT: s_mov_b32 s16, s15
-; GCN-HSA-NEXT: s_mov_b32 s18, s13
-; GCN-HSA-NEXT: s_mov_b32 s20, s11
-; GCN-HSA-NEXT: s_mov_b32 s22, s9
-; GCN-HSA-NEXT: s_lshr_b32 s24, s14, 16
-; GCN-HSA-NEXT: s_lshr_b32 s26, s12, 16
-; GCN-HSA-NEXT: s_lshr_b32 s28, s10, 16
-; GCN-HSA-NEXT: s_lshr_b32 s4, s8, 16
-; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[10:11], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000
-; GCN-HSA-NEXT: s_ashr_i64 s[8:9], s[8:9], 48
-; GCN-HSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 48
+; GCN-HSA-NEXT: s_mov_b32 s6, s19
+; GCN-HSA-NEXT: s_mov_b32 s10, s17
+; GCN-HSA-NEXT: s_mov_b32 s20, s15
+; GCN-HSA-NEXT: s_mov_b32 s22, s13
+; GCN-HSA-NEXT: s_lshr_b32 s24, s18, 16
+; GCN-HSA-NEXT: s_lshr_b32 s26, s16, 16
+; GCN-HSA-NEXT: s_lshr_b32 s28, s14, 16
+; GCN-HSA-NEXT: s_lshr_b32 s30, s12, 16
+; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[18:19], 0x100000
+; GCN-HSA-NEXT: s_ashr_i64 s[18:19], s[18:19], 48
+; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[16:17], 0x100000
; GCN-HSA-NEXT: s_ashr_i64 s[12:13], s[12:13], 48
; GCN-HSA-NEXT: s_ashr_i64 s[14:15], s[14:15], 48
-; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000
+; GCN-HSA-NEXT: s_ashr_i64 s[16:17], s[16:17], 48
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s19
+; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[28:29], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14
-; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x70
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15
-; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: s_nop 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12
-; GCN-HSA-NEXT: s_add_u32 s12, s0, 0x50
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13
-; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: s_nop 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10
+; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000
+; GCN-HSA-NEXT: s_add_u32 s28, s0, 0x70
+; GCN-HSA-NEXT: s_addc_u32 s29, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10
+; GCN-HSA-NEXT: s_add_u32 s10, s0, 0x50
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s28
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11
+; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s29
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s11
; GCN-HSA-NEXT: s_add_u32 s10, s0, 48
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s16
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s17
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11
+; GCN-HSA-NEXT: s_add_u32 s10, s0, 16
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: s_nop 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8
-; GCN-HSA-NEXT: s_add_u32 s8, s0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9
-; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8
+; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9
-; GCN-HSA-NEXT: s_add_u32 s8, s0, 0x60
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11
+; GCN-HSA-NEXT: s_add_u32 s10, s0, 0x60
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8
+; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s34
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s25
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9
-; GCN-HSA-NEXT: s_add_u32 s8, s0, 64
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: s_nop 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8
+; GCN-HSA-NEXT: s_add_u32 s8, s0, 64
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9
; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s27
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_nop 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
-; GCN-HSA-NEXT: s_add_u32 s6, s0, 32
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7
-; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT: s_add_u32 s4, s0, 32
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
+; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s19
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
@@ -7403,106 +7399,108 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, -1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s1, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s3, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s5, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s7, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s9, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s11, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s13, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s15, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s14, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s12, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s10, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s8, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s6, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s4, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s2, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s0, 16
-; GCN-NOHSA-SI-NEXT: s_and_b32 s0, s0, 0xffff
-; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s2, 0xffff
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s1, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s3, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s5, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s7, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s9, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s11, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s13, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s15, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s14, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s12, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s10, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s8, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s6, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s4, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s2, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s0, 16
+; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s0, 0xffff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s2, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff
-; GCN-NOHSA-SI-NEXT: s_and_b32 s1, s1, 0xffff
-; GCN-NOHSA-SI-NEXT: s_and_b32 s3, s3, 0xffff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s1, 0xffff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s38, s3, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff
-; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s15
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:240
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s13
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:208
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:176
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:144
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s3
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s37
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:224
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:192
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:160
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:128
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s33
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s33
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s35
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64:
@@ -7513,141 +7511,142 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT: s_lshr_b32 s19, s1, 16
-; GCN-HSA-NEXT: s_lshr_b32 s20, s3, 16
-; GCN-HSA-NEXT: s_lshr_b32 s21, s5, 16
-; GCN-HSA-NEXT: s_lshr_b32 s22, s7, 16
-; GCN-HSA-NEXT: s_lshr_b32 s23, s9, 16
-; GCN-HSA-NEXT: s_lshr_b32 s24, s11, 16
-; GCN-HSA-NEXT: s_lshr_b32 s25, s13, 16
-; GCN-HSA-NEXT: s_lshr_b32 s26, s15, 16
-; GCN-HSA-NEXT: s_lshr_b32 s27, s14, 16
-; GCN-HSA-NEXT: s_lshr_b32 s28, s12, 16
-; GCN-HSA-NEXT: s_lshr_b32 s29, s10, 16
-; GCN-HSA-NEXT: s_lshr_b32 s30, s8, 16
-; GCN-HSA-NEXT: s_lshr_b32 s31, s6, 16
-; GCN-HSA-NEXT: s_lshr_b32 s33, s4, 16
-; GCN-HSA-NEXT: s_lshr_b32 s34, s2, 16
+; GCN-HSA-NEXT: s_lshr_b32 s20, s1, 16
+; GCN-HSA-NEXT: s_lshr_b32 s21, s3, 16
+; GCN-HSA-NEXT: s_lshr_b32 s22, s5, 16
+; GCN-HSA-NEXT: s_lshr_b32 s23, s7, 16
+; GCN-HSA-NEXT: s_lshr_b32 s24, s9, 16
+; GCN-HSA-NEXT: s_lshr_b32 s25, s11, 16
+; GCN-HSA-NEXT: s_lshr_b32 s26, s13, 16
+; GCN-HSA-NEXT: s_lshr_b32 s27, s15, 16
+; GCN-HSA-NEXT: s_lshr_b32 s28, s14, 16
+; GCN-HSA-NEXT: s_lshr_b32 s29, s12, 16
+; GCN-HSA-NEXT: s_lshr_b32 s30, s10, 16
+; GCN-HSA-NEXT: s_lshr_b32 s31, s8, 16
+; GCN-HSA-NEXT: s_lshr_b32 s33, s6, 16
+; GCN-HSA-NEXT: s_lshr_b32 s34, s4, 16
+; GCN-HSA-NEXT: s_lshr_b32 s19, s2, 16
; GCN-HSA-NEXT: s_lshr_b32 s18, s0, 16
; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xffff
-; GCN-HSA-NEXT: s_and_b32 s35, s2, 0xffff
-; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff
+; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff
+; GCN-HSA-NEXT: s_and_b32 s35, s4, 0xffff
; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff
; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff
; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff
; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff
; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff
; GCN-HSA-NEXT: s_and_b32 s1, s1, 0xffff
-; GCN-HSA-NEXT: s_and_b32 s36, s3, 0xffff
-; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff
+; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff
+; GCN-HSA-NEXT: s_and_b32 s36, s5, 0xffff
; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff
; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff
; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff
; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff
-; GCN-HSA-NEXT: s_and_b32 s2, s15, 0xffff
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xf0
-; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xd0
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26
-; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
+; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff
+; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xf0
+; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xd0
+; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s4
+; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xb0
+; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4
+; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x90
+; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s15
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xb0
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25
-; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x90
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26
+; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
+; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x70
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24
-; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x70
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23
-; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
+; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x50
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x50
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7
+; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22
-; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT: s_add_u32 s4, s16, 48
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s16, 48
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5
+; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21
-; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s16, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20
-; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT: s_add_u32 s4, s16, 16
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xe0
+; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19
-; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xe0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xc0
+; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27
-; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xa0
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28
-; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xc0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x80
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10
+; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29
-; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xa0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x60
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8
+; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30
-; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x80
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s16, 64
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
+; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31
-; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x60
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s16, 32
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33
-; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT: s_add_u32 s4, s16, 64
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: s_nop 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s16, 32
+; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16
@@ -8091,144 +8090,140 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15
; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s11
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s9
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s28, s7
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s5
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s3
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s50, s11
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s52, s9
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s56, s7
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s54, s5
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s3
; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s1
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s14, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s12, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s14, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s12, 16
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s10, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s8, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s6, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s4, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s2, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s0, 16
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[62:63], s[20:21], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[64:65], s[18:19], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[66:67], s[30:31], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[68:69], s[22:23], 0x100000
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s8, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[68:69], s[20:21], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[70:71], s[18:19], 0x100000
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s60, s6, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s62, s4, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s64, s2, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s66, s0, 16
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[6:7], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[8:9], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[10:11], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[56:57], s[12:13], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[58:59], s[14:15], 0x100000
-; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[60:61], s[0:1], 48
-; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[70:71], s[2:3], 48
-; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[72:73], s[4:5], 48
-; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[2:3], s[8:9], 48
-; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[12:13], 48
-; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[8:9], s[14:15], 48
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[10:11], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[12:13], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[46:47], s[14:15], 0x100000
+; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[48:49], s[0:1], 48
+; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[58:59], s[2:3], 48
+; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[6:7], s[6:7], 48
+; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[8:9], s[8:9], 48
; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[10:11], s[10:11], 48
-; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[74:75], s[6:7], 48
+; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[2:3], s[12:13], 48
+; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[12:13], s[14:15], 48
+; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 48
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s64
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s65
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s62
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s63
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s4
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s5
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s68
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s69
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s10
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s11
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s66
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s67
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s2
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s3
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s70
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s71
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s68
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s69
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s2
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s3
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[56:57], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[52:53], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[50:51], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[54:55], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[52:53], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[46:47], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[42:43], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[38:39], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[34:35], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[26:27], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s17
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s10
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s11
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s14
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s15
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s9
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s12
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s13
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s6
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s7
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s50
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s51
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s5
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[62:63], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[36:37], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[34:35], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[30:31], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s28
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s74
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s75
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s72
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s73
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s40
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s41
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s70
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s71
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s42
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s43
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s58
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s44
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s45
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s60
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s61
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s48
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s49
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s58
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s59
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s56
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s57
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s54
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s55
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s50
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s51
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s30
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s31
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s46
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s47
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s40
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s41
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s38
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s39
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s28
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s29
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s24
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s25
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s22
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s23
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s20
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s27
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s21
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s18
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s19
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s12
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s13
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s10
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s11
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s6
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s7
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s4
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s5
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s9
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, s6
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, s7
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64:
@@ -8237,13 +8232,13 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT: s_mov_b32 s42, s15
+; GCN-HSA-NEXT: s_mov_b32 s40, s15
; GCN-HSA-NEXT: s_mov_b32 s48, s13
; GCN-HSA-NEXT: s_mov_b32 s50, s11
; GCN-HSA-NEXT: s_mov_b32 s52, s9
; GCN-HSA-NEXT: s_mov_b32 s54, s7
; GCN-HSA-NEXT: s_mov_b32 s56, s5
-; GCN-HSA-NEXT: s_mov_b32 s46, s3
+; GCN-HSA-NEXT: s_mov_b32 s44, s3
; GCN-HSA-NEXT: s_mov_b32 s58, s1
; GCN-HSA-NEXT: s_lshr_b32 s60, s14, 16
; GCN-HSA-NEXT: s_lshr_b32 s62, s12, 16
@@ -8258,15 +8253,15 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: s_ashr_i64 s[36:37], s[0:1], 48
; GCN-HSA-NEXT: s_ashr_i64 s[38:39], s[2:3], 48
; GCN-HSA-NEXT: s_ashr_i64 s[0:1], s[14:15], 48
-; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[42:43], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[40:41], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[10:11], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000
-; GCN-HSA-NEXT: s_ashr_i64 s[40:41], s[4:5], 48
-; GCN-HSA-NEXT: s_ashr_i64 s[44:45], s[6:7], 48
+; GCN-HSA-NEXT: s_ashr_i64 s[42:43], s[4:5], 48
+; GCN-HSA-NEXT: s_ashr_i64 s[46:47], s[6:7], 48
; GCN-HSA-NEXT: s_ashr_i64 s[76:77], s[8:9], 48
; GCN-HSA-NEXT: s_ashr_i64 s[78:79], s[10:11], 48
; GCN-HSA-NEXT: s_ashr_i64 s[80:81], s[12:13], 48
@@ -8282,8 +8277,8 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[64:65], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[62:63], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[60:61], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[58:59], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[58:59], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000
@@ -8299,84 +8294,82 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v25, s49
; GCN-HSA-NEXT: s_add_u32 s48, s16, 0xb0
; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, s48
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s49
-; GCN-HSA-NEXT: s_add_u32 s48, s16, 0x90
-; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v18, s44
-; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x70
-; GCN-HSA-NEXT: v_mov_b32_e32 v19, s45
-; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v30, s44
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s58
-; GCN-HSA-NEXT: v_mov_b32_e32 v31, s45
-; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x50
+; GCN-HSA-NEXT: v_mov_b32_e32 v18, s46
+; GCN-HSA-NEXT: s_add_u32 s46, s16, 0x90
; GCN-HSA-NEXT: v_mov_b32_e32 v23, s59
-; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s47
+; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s42
+; GCN-HSA-NEXT: s_add_u32 s42, s16, 0x70
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, s43
+; GCN-HSA-NEXT: s_addc_u32 s43, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v30, s42
+; GCN-HSA-NEXT: v_mov_b32_e32 v31, s43
+; GCN-HSA-NEXT: s_add_u32 s42, s16, 0x50
+; GCN-HSA-NEXT: s_addc_u32 s43, s17, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s80
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s81
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s48
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38
; GCN-HSA-NEXT: s_add_u32 s38, s16, 48
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s81
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39
-; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s50
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s51
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s78
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s79
-; GCN-HSA-NEXT: v_mov_b32_e32 v28, s48
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, s49
; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s52
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s36
-; GCN-HSA-NEXT: s_add_u32 s36, s16, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s53
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s76
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s77
-; GCN-HSA-NEXT: v_mov_b32_e32 v29, s49
-; GCN-HSA-NEXT: v_mov_b32_e32 v16, s54
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s55
-; GCN-HSA-NEXT: v_mov_b32_e32 v32, s44
; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s38
-; GCN-HSA-NEXT: s_addc_u32 s37, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s36
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39
+; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s38
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s42
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s39
+; GCN-HSA-NEXT: s_add_u32 s38, s16, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s56
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s57
-; GCN-HSA-NEXT: v_mov_b32_e32 v33, s45
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s40
-; GCN-HSA-NEXT: v_mov_b32_e32 v23, s41
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s46
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s47
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s42
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s43
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s39
-; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15]
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s37
-; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s43
+; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46
+; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23]
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s14
; GCN-HSA-NEXT: s_add_u32 s14, s16, 0xe0
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s52
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s53
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s76
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s77
+; GCN-HSA-NEXT: v_mov_b32_e32 v29, s47
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s15
; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s34
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: s_nop 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12
+; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15]
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s54
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s12
; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xc0
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s55
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s13
; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: s_nop 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s45
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s38
+; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19]
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s40
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s15
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s41
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, s39
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s36
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s34
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s35
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s30
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s31
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s14
+; GCN-HSA-NEXT: v_mov_b32_e32 v18, s12
+; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10
; GCN-HSA-NEXT: s_add_u32 s10, s16, 0xa0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11
@@ -8441,208 +8434,211 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s20, s1
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s1, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[2:3], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[8:9], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[10:11], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[12:13], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[62:63], s[14:15], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[20:21], 0x100000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[26:27], 0x100000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[2:3], 0x100000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, s3
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x100000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s5
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s7
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[8:9], 0x100000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s9
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[10:11], 0x100000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s11
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[12:13], 0x100000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s13
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[64:65], s[14:15], 0x100000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s14, s15
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s0, 16
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s26, s1
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s1, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s2, 16
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s34, s3
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s3, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s4, 16
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s42, s5
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s5, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s6, 16
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s48, s7
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s7, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s8, 16
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s54, s9
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s9, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s10, 16
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s60, s11
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s11, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s12, 16
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s68, s13
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s70, s13, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s74, s14, 16
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s76, s15
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s78, s15, 16
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[2:3], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s3, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s5, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s7, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[8:9], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s9, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[10:11], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s11, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[12:13], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s13, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[66:67], s[14:15], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s15, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14
-; GCN-NOHSA-VI-NEXT: s_add_u32 s14, s16, 0xf0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s15, s17, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s14
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s66
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s67
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s15
-; GCN-NOHSA-VI-NEXT: s_add_u32 s14, s16, 0xe0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s15, s17, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s14
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s62
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s63
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s64
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s15
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_nop 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12
-; GCN-NOHSA-VI-NEXT: s_add_u32 s12, s16, 0xd0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s13, s17, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s12
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[6:7], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[8:9], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[10:11], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[64:65], s[12:13], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[72:73], s[14:15], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[22:23], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[30:31], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[34:35], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[36:37], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[40:41], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[42:43], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[44:45], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[46:47], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[48:49], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[50:51], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[52:53], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[54:55], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[56:57], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[58:59], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[60:61], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[62:63], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[66:67], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[68:69], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[70:71], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[74:75], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[76:77], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[62:63], s[78:79], 0x100000
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s60
+; GCN-NOHSA-VI-NEXT: s_add_u32 s60, s16, 0xf0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s61
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s13
-; GCN-NOHSA-VI-NEXT: s_add_u32 s12, s16, 0xc0
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s61, s17, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s60
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s62
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s61
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s13, s17, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s12
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s56
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57
+; GCN-NOHSA-VI-NEXT: s_nop 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s58
+; GCN-NOHSA-VI-NEXT: s_add_u32 s58, s16, 0xe0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s13
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s59, s17, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s58
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s72
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s73
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s59
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_nop 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10
-; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s16, 0xb0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s17, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54
+; GCN-NOHSA-VI-NEXT: s_add_u32 s54, s16, 0xd0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s55
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11
-; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s16, 0xa0
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s55, s17, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s54
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s56
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s55
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s17, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51
+; GCN-NOHSA-VI-NEXT: s_nop 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52
+; GCN-NOHSA-VI-NEXT: s_add_u32 s52, s16, 0xc0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s53, s17, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s52
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s64
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s65
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s53
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_nop 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8
-; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s16, 0x90
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s17, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48
+; GCN-NOHSA-VI-NEXT: s_add_u32 s48, s16, 0xb0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9
-; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s16, 0x80
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s49, s17, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s48
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s50
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s49
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s17, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45
+; GCN-NOHSA-VI-NEXT: s_nop 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s38
+; GCN-NOHSA-VI-NEXT: s_add_u32 s38, s16, 0xa0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s39, s17, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s38
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s46
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s39
+; GCN-NOHSA-VI-NEXT: s_add_u32 s38, s16, 0x90
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_nop 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s16, 0x70
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s17, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s42
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s43
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7
-; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s16, 0x60
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s39, s17, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s38
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s42
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s39
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s17, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s38
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39
+; GCN-NOHSA-VI-NEXT: s_nop 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24
+; GCN-NOHSA-VI-NEXT: s_add_u32 s24, s16, 0x80
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s25, s17, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s40
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s25
+; GCN-NOHSA-VI-NEXT: s_add_u32 s24, s16, 0x70
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_nop 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x50
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s25, s17, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
-; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 64
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s25
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_nop 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 48
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 32
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20
+; GCN-NOHSA-VI-NEXT: s_add_u32 s20, s16, 0x60
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s21, s17, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s20
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s21
+; GCN-NOHSA-VI-NEXT: s_add_u32 s20, s16, 0x50
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s21, s17, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s20
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s21
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: s_nop 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 64
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27
+; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 48
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s23
; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s21
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: s_nop 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 32
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s17
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index 6eeaec12c3d148..341332e60b5c0d 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -2713,37 +2713,39 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s9
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s6
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s7
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s4
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s5
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s2
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s3
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s35
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s34
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s36
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s33
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:96
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s3
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s31
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s30
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:80
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s0
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s1
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s29
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s28
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[16:19], 0 offset:64
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s27
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s26
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[16:19], 0 offset:48
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s25
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s24
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[16:19], 0 offset:32
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s23
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s22
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[16:19], 0 offset:16
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s21
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s20
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s25
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s24
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s23
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s22
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s21
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s20
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0
; GFX6-NOHSA-NEXT: s_endpgm
;
; GFX7-HSA-LABEL: constant_sextload_v16i32_to_v16i64:
@@ -2752,97 +2754,91 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: s_ashr_i32 s18, s1, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s19, s0, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s20, s3, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s21, s2, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s22, s5, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s23, s4, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s24, s7, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s25, s6, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s26, s9, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s27, s8, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s28, s11, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s29, s10, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s30, s13, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s31, s12, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s33, s15, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s34, s14, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s20, s1, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s21, s0, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s22, s3, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s23, s2, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s24, s5, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s25, s4, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s26, s7, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s27, s6, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s28, s9, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s29, s8, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s30, s11, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s31, s10, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s33, s13, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s34, s12, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s35, s15, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s36, s14, 31
+; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x70
+; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s18
+; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s19
+; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x60
+; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s18
+; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s19
+; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x50
+; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14
-; GFX7-HSA-NEXT: s_add_u32 s14, s16, 0x70
+; GFX7-HSA-NEXT: s_add_u32 s14, s16, 64
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15
; GFX7-HSA-NEXT: s_addc_u32 s15, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s34
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s15
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_nop 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12
-; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0x60
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13
-; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s31
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s30
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s34
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s33
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[13:14], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_nop 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10
-; GFX7-HSA-NEXT: s_add_u32 s10, s16, 0x50
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11
-; GFX7-HSA-NEXT: s_addc_u32 s11, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_nop 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8
-; GFX7-HSA-NEXT: s_add_u32 s8, s16, 64
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9
-; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_nop 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
; GFX7-HSA-NEXT: s_add_u32 s6, s16, 48
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[15:16], v[3:6]
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7
; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_nop 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s18
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
; GFX7-HSA-NEXT: s_add_u32 s4, s16, 32
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5
; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_nop 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s19
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s16, 16
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3
; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s20
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s31
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s30
+; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s15
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s11
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s29
+; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s28
+; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s14
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[17:18], v[6:9]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s20
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[19:20], v[9:12]
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_endpgm
;
@@ -3500,137 +3496,135 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %
define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
; GFX6-NOHSA-LABEL: constant_sextload_v32i32_to_v32i64:
; GFX6-NOHSA: ; %bb.0:
-; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9
+; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
+; GFX6-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x10
; GFX6-NOHSA-NEXT: s_mov_b32 s39, 0xf000
; GFX6-NOHSA-NEXT: s_mov_b32 s38, -1
-; GFX6-NOHSA-NEXT: s_mov_b32 s36, s16
-; GFX6-NOHSA-NEXT: s_mov_b32 s37, s17
-; GFX6-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[18:19], 0x10
+; GFX6-NOHSA-NEXT: s_mov_b32 s36, s0
+; GFX6-NOHSA-NEXT: s_mov_b32 s37, s1
+; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s1, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s34, s0, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s35, s3, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s40, s2, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s41, s5, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s42, s4, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s43, s7, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s44, s6, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s45, s17, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s46, s16, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s47, s19, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s48, s18, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s49, s21, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s50, s20, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s51, s23, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s52, s30, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s53, s31, 31
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s52
-; GFX6-NOHSA-NEXT: s_ashr_i32 s52, s28, 31
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s53
-; GFX6-NOHSA-NEXT: s_ashr_i32 s53, s29, 31
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s52
-; GFX6-NOHSA-NEXT: s_ashr_i32 s52, s26, 31
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s53
-; GFX6-NOHSA-NEXT: s_ashr_i32 s53, s27, 31
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s52
-; GFX6-NOHSA-NEXT: s_ashr_i32 s52, s22, 31
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s53
-; GFX6-NOHSA-NEXT: s_ashr_i32 s53, s25, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s17, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s34, s16, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s35, s19, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s40, s18, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s41, s21, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s42, s20, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s43, s30, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s44, s31, 31
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s43
+; GFX6-NOHSA-NEXT: s_ashr_i32 s43, s28, 31
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s44
+; GFX6-NOHSA-NEXT: s_ashr_i32 s44, s29, 31
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s43
+; GFX6-NOHSA-NEXT: s_ashr_i32 s43, s23, 31
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s44
+; GFX6-NOHSA-NEXT: s_ashr_i32 s44, s22, 31
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s30
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s31
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s28
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s29
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s26
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s27
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s25
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s22
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s23
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:240
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s20
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s21
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:224
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s18
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s16
+; GFX6-NOHSA-NEXT: s_ashr_i32 s16, s25, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s18, s27, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s26, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s22, s24, 31
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s27
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s25
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s23
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s21
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s19
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s17
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s20
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s18
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:208
-; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s16
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s17
-; GFX6-NOHSA-NEXT: s_ashr_i32 s16, s24, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s9, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s18, s8, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s11, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s10, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s13, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s22, s12, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s23, s15, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s24, s14, 31
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s16
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s53
+; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s1, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s18, s0, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s3, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s2, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s5, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s23, s4, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s24, s7, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s25, s6, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s26, s9, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s27, s8, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s28, s11, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s29, s10, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s30, s13, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s12, 31
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s22
+; GFX6-NOHSA-NEXT: s_ashr_i32 s22, s15, 31
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s16
+; GFX6-NOHSA-NEXT: s_ashr_i32 s16, s14, 31
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:192
-; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s14
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s15
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s52
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s51
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s14
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s15
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s44
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s43
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:176
-; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s12
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s13
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s50
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s49
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s12
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s13
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s42
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s41
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:160
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s10
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s11
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s48
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s47
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s40
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s35
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:144
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s8
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s9
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s46
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s45
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:128
-; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s6
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s7
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s23
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:112
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s34
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s33
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[36:39], 0 offset:128
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s6
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s7
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s16
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s22
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:112
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s4
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s5
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s22
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s21
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:96
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s4
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s5
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s31
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s30
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:96
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s2
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s3
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s20
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s19
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s2
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s3
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s29
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s28
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:80
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s18
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s17
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s27
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s26
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:64
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s44
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s43
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:48
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s42
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s41
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:32
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s40
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s35
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:16
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s34
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s33
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s25
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s24
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:48
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s23
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s21
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:32
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s20
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s19
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:16
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s18
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0
; GFX6-NOHSA-NEXT: s_endpgm
;
@@ -3646,45 +3640,45 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: s_ashr_i32 s23, s2, 31
; GFX7-HSA-NEXT: s_ashr_i32 s24, s5, 31
; GFX7-HSA-NEXT: s_ashr_i32 s25, s4, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s26, s7, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s27, s6, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s28, s9, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s29, s8, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s30, s11, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s31, s10, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s33, s13, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s34, s12, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s35, s15, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s36, s14, 31
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12
-; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s13
+; GFX7-HSA-NEXT: s_ashr_i32 s28, s7, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s29, s6, 31
+; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s14
+; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s12
; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX7-HSA-NEXT: s_ashr_i32 s36, s9, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s37, s8, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s38, s11, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s39, s10, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s40, s13, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s41, s12, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s42, s15, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s43, s14, 31
+; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s15
+; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s13
; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s11
-; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s8
-; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s9
-; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s6
-; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s7
-; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s4
-; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s5
-; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s2
-; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s3
-; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s4
+; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s5
+; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s2
+; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s1
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s34
-; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s33
+; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s43
+; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s42
+; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s41
+; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s40
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: s_ashr_i32 s37, s1, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s38, s0, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s39, s3, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s40, s2, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s41, s5, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s42, s4, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s43, s7, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s18, s1, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s19, s0, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s26, s3, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s27, s2, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s30, s5, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s31, s4, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s33, s7, 31
; GFX7-HSA-NEXT: s_ashr_i32 s44, s6, 31
; GFX7-HSA-NEXT: s_ashr_i32 s45, s9, 31
; GFX7-HSA-NEXT: s_ashr_i32 s46, s8, 31
@@ -3694,105 +3688,101 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: s_ashr_i32 s50, s12, 31
; GFX7-HSA-NEXT: s_ashr_i32 s51, s15, 31
; GFX7-HSA-NEXT: s_ashr_i32 s52, s14, 31
-; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0xf0
-; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s19
-; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s18
-; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0xe0
-; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s19
-; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s18
-; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0xd0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[31:32], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19
-; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0xc0
-; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19
-; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0xb0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s31
-; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s30
-; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s29
-; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s28
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[33:34], v[4:7]
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[2:3], v[12:15]
-; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18
-; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s27
-; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s26
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19
-; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0xa0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
-; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18
-; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s25
-; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s24
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19
-; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x90
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
-; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18
-; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s23
-; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s22
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19
-; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x80
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
-; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18
-; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s21
-; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s20
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[28:31]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14
-; GFX7-HSA-NEXT: s_add_u32 s14, s16, 0x70
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15
-; GFX7-HSA-NEXT: s_addc_u32 s15, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14
+; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xf0
+; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s34
+; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s35
+; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xe0
+; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s34
+; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s35
+; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xd0
+; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v36, s35
+; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s34
+; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xc0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[31:32], v[27:30]
+; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s34
+; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s35
+; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xb0
+; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s34
+; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s35
+; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xa0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[33:34], v[23:26]
+; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s34
+; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s35
+; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0x90
+; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s37
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s36
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28
+; GFX7-HSA-NEXT: s_add_u32 s28, s16, 0x80
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29
+; GFX7-HSA-NEXT: s_addc_u32 s29, s17, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[4:7]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s24
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s28
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s29
+; GFX7-HSA-NEXT: s_add_u32 s28, s16, 0x70
+; GFX7-HSA-NEXT: s_addc_u32 s29, s17, 0
+; GFX7-HSA-NEXT: s_add_u32 s24, s16, 0x60
+; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s25
+; GFX7-HSA-NEXT: s_addc_u32 s25, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s39
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s38
+; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s25
+; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s34
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[35:36], v[8:11]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s24
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s28
+; GFX7-HSA-NEXT: s_add_u32 s24, s16, 0x50
+; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s35
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s29
+; GFX7-HSA-NEXT: s_addc_u32 s25, s17, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[29:30], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s23
+; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s22
+; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s21
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[16:19]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15]
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s52
+; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s20
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s51
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s15
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_nop 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12
-; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0x60
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13
-; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s50
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s49
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15
+; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s25
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s50
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[20:23]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s49
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GFX7-HSA-NEXT: s_add_u32 s14, s16, 64
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_nop 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10
-; GFX7-HSA-NEXT: s_add_u32 s10, s16, 0x50
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11
-; GFX7-HSA-NEXT: s_addc_u32 s11, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s48
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s47
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_nop 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8
-; GFX7-HSA-NEXT: s_add_u32 s8, s16, 64
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9
-; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s46
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s45
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_nop 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s24
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s48
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s47
+; GFX7-HSA-NEXT: s_addc_u32 s15, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[3:6]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s11
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s46
+; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s45
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[29:30], v[6:9]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[9:12]
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
; GFX7-HSA-NEXT: s_add_u32 s6, s16, 48
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7
; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s44
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s43
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
@@ -3801,8 +3791,8 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5
; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s42
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s31
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s30
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
@@ -3811,15 +3801,15 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3
; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s40
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s39
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s38
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s37
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_endpgm
@@ -4193,43 +4183,37 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX9-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40
; GFX9-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0
; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-HSA-NEXT: s_ashr_i32 s65, s31, 31
-; GFX9-HSA-NEXT: s_ashr_i32 s66, s30, 31
-; GFX9-HSA-NEXT: s_ashr_i32 s63, s29, 31
-; GFX9-HSA-NEXT: s_ashr_i32 s64, s28, 31
+; GFX9-HSA-NEXT: s_ashr_i32 s58, s30, 31
+; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s58
+; GFX9-HSA-NEXT: s_ashr_i32 s58, s31, 31
+; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s58
+; GFX9-HSA-NEXT: s_ashr_i32 s58, s28, 31
+; GFX9-HSA-NEXT: v_mov_b32_e32 v5, s58
+; GFX9-HSA-NEXT: s_ashr_i32 s58, s29, 31
+; GFX9-HSA-NEXT: v_mov_b32_e32 v7, s58
+; GFX9-HSA-NEXT: s_ashr_i32 s58, s26, 31
+; GFX9-HSA-NEXT: v_mov_b32_e32 v8, s58
+; GFX9-HSA-NEXT: s_ashr_i32 s58, s27, 31
+; GFX9-HSA-NEXT: v_mov_b32_e32 v10, s58
+; GFX9-HSA-NEXT: s_ashr_i32 s58, s24, 31
+; GFX9-HSA-NEXT: v_mov_b32_e32 v11, s58
+; GFX9-HSA-NEXT: s_ashr_i32 s58, s25, 31
; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s30
-; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s66
; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s31
-; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s65
-; GFX9-HSA-NEXT: s_ashr_i32 s61, s27, 31
-; GFX9-HSA-NEXT: s_ashr_i32 s62, s26, 31
-; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:240
-; GFX9-HSA-NEXT: s_ashr_i32 s59, s25, 31
-; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s28
-; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s64
-; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s29
-; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s63
-; GFX9-HSA-NEXT: s_ashr_i32 s60, s24, 31
-; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:224
; GFX9-HSA-NEXT: s_ashr_i32 s57, s23, 31
-; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s26
-; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s62
-; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s27
-; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s61
+; GFX9-HSA-NEXT: v_mov_b32_e32 v13, s58
; GFX9-HSA-NEXT: s_ashr_i32 s58, s22, 31
-; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:208
+; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:240
+; GFX9-HSA-NEXT: v_mov_b32_e32 v6, s29
+; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s28
; GFX9-HSA-NEXT: s_ashr_i32 s55, s21, 31
-; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s24
-; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s60
-; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s25
-; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s59
; GFX9-HSA-NEXT: s_ashr_i32 s56, s20, 31
-; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:192
-; GFX9-HSA-NEXT: s_ashr_i32 s53, s19, 31
+; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[4:7], s[36:37] offset:224
; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s22
; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s58
; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s23
; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s57
+; GFX9-HSA-NEXT: s_ashr_i32 s53, s19, 31
; GFX9-HSA-NEXT: s_ashr_i32 s54, s18, 31
; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:176
; GFX9-HSA-NEXT: s_ashr_i32 s51, s17, 31
@@ -4294,14 +4278,18 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s5
; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s39
; GFX9-HSA-NEXT: s_ashr_i32 s34, s0, 31
+; GFX9-HSA-NEXT: v_mov_b32_e32 v7, s26
+; GFX9-HSA-NEXT: v_mov_b32_e32 v9, s27
; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:32
-; GFX9-HSA-NEXT: s_nop 0
+; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[7:10], s[36:37] offset:208
; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s2
; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s38
; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s3
; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s35
+; GFX9-HSA-NEXT: v_mov_b32_e32 v10, s24
+; GFX9-HSA-NEXT: v_mov_b32_e32 v12, s25
; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:16
-; GFX9-HSA-NEXT: s_nop 0
+; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[10:13], s[36:37] offset:192
; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s0
; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s34
; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s1
@@ -4496,64 +4484,64 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xf0
+; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xe0
+; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s0
+; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xd0
+; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s0
+; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xc0
+; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s0
+; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xb0
+; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s1
; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10
-; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0
-; GFX7-HSA-NEXT: s_add_u32 s34, s36, 0xf0
-; GFX7-HSA-NEXT: s_addc_u32 s35, s37, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s34
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s35
+; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s0
+; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xa0
+; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0
+; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0x90
+; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31
+; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s0
+; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28
-; GFX7-HSA-NEXT: s_add_u32 s28, s36, 0xe0
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s29
-; GFX7-HSA-NEXT: s_addc_u32 s29, s37, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s28
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s29
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26
-; GFX7-HSA-NEXT: s_add_u32 s26, s36, 0xd0
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s27
-; GFX7-HSA-NEXT: s_addc_u32 s27, s37, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s26
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s27
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24
-; GFX7-HSA-NEXT: s_add_u32 s24, s36, 0xc0
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25
-; GFX7-HSA-NEXT: s_addc_u32 s25, s37, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s24
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22
-; GFX7-HSA-NEXT: s_add_u32 s22, s36, 0xb0
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s23
-; GFX7-HSA-NEXT: s_addc_u32 s23, s37, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s22
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s23
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20
-; GFX7-HSA-NEXT: s_add_u32 s20, s36, 0xa0
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s21
-; GFX7-HSA-NEXT: s_addc_u32 s21, s37, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18
-; GFX7-HSA-NEXT: s_add_u32 s18, s36, 0x90
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s19
-; GFX7-HSA-NEXT: s_addc_u32 s19, s37, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16
; GFX7-HSA-NEXT: s_add_u32 s16, s36, 0x80
@@ -4562,7 +4550,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_nop 0
+; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14
; GFX7-HSA-NEXT: s_add_u32 s14, s36, 0x70
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15
@@ -5111,53 +5099,52 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
;
; GFX7-HSA-LABEL: constant_load_v32i32:
; GFX7-HSA: ; %bb.0:
-; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10
-; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0
-; GFX7-HSA-NEXT: s_add_u32 s34, s36, 0x70
-; GFX7-HSA-NEXT: s_addc_u32 s35, s37, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s34
-; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s35
+; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s30
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s24
-; GFX7-HSA-NEXT: s_add_u32 s24, s36, 0x60
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25
-; GFX7-HSA-NEXT: s_addc_u32 s25, s37, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24
-; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s26
-; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s27
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s11
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s4
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s5
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s6
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s2
+; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s3
+; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
+; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x70
+; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s18
+; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s19
+; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x60
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
+; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19
+; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x50
; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20
-; GFX7-HSA-NEXT: s_add_u32 s20, s36, 0x50
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21
-; GFX7-HSA-NEXT: s_addc_u32 s21, s37, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s23
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_nop 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16
-; GFX7-HSA-NEXT: s_add_u32 s16, s36, 64
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17
-; GFX7-HSA-NEXT: s_addc_u32 s17, s37, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_nop 0
+; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19
+; GFX7-HSA-NEXT: s_add_u32 s18, s16, 64
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
+; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12
-; GFX7-HSA-NEXT: s_add_u32 s12, s36, 48
+; GFX7-HSA-NEXT: s_add_u32 s12, s16, 48
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13
-; GFX7-HSA-NEXT: s_addc_u32 s13, s37, 0
+; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15
@@ -5165,9 +5152,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8
-; GFX7-HSA-NEXT: s_add_u32 s8, s36, 32
+; GFX7-HSA-NEXT: s_add_u32 s8, s16, 32
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9
-; GFX7-HSA-NEXT: s_addc_u32 s9, s37, 0
+; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11
@@ -5175,20 +5162,20 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-HSA-NEXT: s_add_u32 s4, s36, 16
+; GFX7-HSA-NEXT: s_add_u32 s4, s16, 16
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-HSA-NEXT: s_addc_u32 s5, s37, 0
+; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s36
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s37
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
index 102c33ec31b09d..b3e75e767ae641 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -638,53 +638,52 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
;
; GFX7-LABEL: constant_load_v16i64:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0
+; GFX7-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10
-; GFX7-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0
-; GFX7-NEXT: s_add_u32 s34, s36, 0x70
-; GFX7-NEXT: s_addc_u32 s35, s37, 0
-; GFX7-NEXT: v_mov_b32_e32 v5, s34
-; GFX7-NEXT: v_mov_b32_e32 v6, s35
+; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s28
-; GFX7-NEXT: v_mov_b32_e32 v1, s29
-; GFX7-NEXT: v_mov_b32_e32 v2, s30
-; GFX7-NEXT: v_mov_b32_e32 v3, s31
-; GFX7-NEXT: v_mov_b32_e32 v4, s24
-; GFX7-NEXT: s_add_u32 s24, s36, 0x60
-; GFX7-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
-; GFX7-NEXT: v_mov_b32_e32 v5, s25
-; GFX7-NEXT: s_addc_u32 s25, s37, 0
-; GFX7-NEXT: v_mov_b32_e32 v0, s24
-; GFX7-NEXT: v_mov_b32_e32 v6, s26
-; GFX7-NEXT: v_mov_b32_e32 v7, s27
-; GFX7-NEXT: v_mov_b32_e32 v1, s25
+; GFX7-NEXT: v_mov_b32_e32 v0, s12
+; GFX7-NEXT: v_mov_b32_e32 v1, s13
+; GFX7-NEXT: v_mov_b32_e32 v2, s14
+; GFX7-NEXT: v_mov_b32_e32 v3, s15
+; GFX7-NEXT: v_mov_b32_e32 v4, s8
+; GFX7-NEXT: v_mov_b32_e32 v5, s9
+; GFX7-NEXT: v_mov_b32_e32 v6, s10
+; GFX7-NEXT: v_mov_b32_e32 v7, s11
+; GFX7-NEXT: v_mov_b32_e32 v8, s4
+; GFX7-NEXT: v_mov_b32_e32 v9, s5
+; GFX7-NEXT: v_mov_b32_e32 v10, s6
+; GFX7-NEXT: v_mov_b32_e32 v11, s7
+; GFX7-NEXT: v_mov_b32_e32 v12, s0
+; GFX7-NEXT: v_mov_b32_e32 v13, s1
+; GFX7-NEXT: v_mov_b32_e32 v14, s2
+; GFX7-NEXT: v_mov_b32_e32 v15, s3
+; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
+; GFX7-NEXT: s_add_u32 s18, s16, 0x70
+; GFX7-NEXT: s_addc_u32 s19, s17, 0
+; GFX7-NEXT: v_mov_b32_e32 v16, s18
+; GFX7-NEXT: v_mov_b32_e32 v17, s19
+; GFX7-NEXT: s_add_u32 s18, s16, 0x60
+; GFX7-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
+; GFX7-NEXT: s_addc_u32 s19, s17, 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s18
+; GFX7-NEXT: v_mov_b32_e32 v1, s19
+; GFX7-NEXT: s_add_u32 s18, s16, 0x50
; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, s20
-; GFX7-NEXT: s_add_u32 s20, s36, 0x50
-; GFX7-NEXT: v_mov_b32_e32 v1, s21
-; GFX7-NEXT: s_addc_u32 s21, s37, 0
-; GFX7-NEXT: v_mov_b32_e32 v4, s20
-; GFX7-NEXT: v_mov_b32_e32 v2, s22
-; GFX7-NEXT: v_mov_b32_e32 v3, s23
-; GFX7-NEXT: v_mov_b32_e32 v5, s21
-; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-NEXT: s_nop 0
-; GFX7-NEXT: v_mov_b32_e32 v0, s16
-; GFX7-NEXT: s_add_u32 s16, s36, 64
-; GFX7-NEXT: v_mov_b32_e32 v1, s17
-; GFX7-NEXT: s_addc_u32 s17, s37, 0
-; GFX7-NEXT: v_mov_b32_e32 v4, s16
-; GFX7-NEXT: v_mov_b32_e32 v2, s18
-; GFX7-NEXT: v_mov_b32_e32 v3, s19
-; GFX7-NEXT: v_mov_b32_e32 v5, s17
-; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-NEXT: s_nop 0
+; GFX7-NEXT: s_addc_u32 s19, s17, 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s18
+; GFX7-NEXT: v_mov_b32_e32 v1, s19
+; GFX7-NEXT: s_add_u32 s18, s16, 64
+; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; GFX7-NEXT: s_addc_u32 s19, s17, 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s18
+; GFX7-NEXT: v_mov_b32_e32 v1, s19
+; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s12
-; GFX7-NEXT: s_add_u32 s12, s36, 48
+; GFX7-NEXT: s_add_u32 s12, s16, 48
; GFX7-NEXT: v_mov_b32_e32 v1, s13
-; GFX7-NEXT: s_addc_u32 s13, s37, 0
+; GFX7-NEXT: s_addc_u32 s13, s17, 0
; GFX7-NEXT: v_mov_b32_e32 v4, s12
; GFX7-NEXT: v_mov_b32_e32 v2, s14
; GFX7-NEXT: v_mov_b32_e32 v3, s15
@@ -692,9 +691,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-NEXT: s_nop 0
; GFX7-NEXT: v_mov_b32_e32 v0, s8
-; GFX7-NEXT: s_add_u32 s8, s36, 32
+; GFX7-NEXT: s_add_u32 s8, s16, 32
; GFX7-NEXT: v_mov_b32_e32 v1, s9
-; GFX7-NEXT: s_addc_u32 s9, s37, 0
+; GFX7-NEXT: s_addc_u32 s9, s17, 0
; GFX7-NEXT: v_mov_b32_e32 v4, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s10
; GFX7-NEXT: v_mov_b32_e32 v3, s11
@@ -702,20 +701,20 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-NEXT: s_nop 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: s_add_u32 s4, s36, 16
+; GFX7-NEXT: s_add_u32 s4, s16, 16
; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_addc_u32 s5, s37, 0
+; GFX7-NEXT: s_addc_u32 s5, s17, 0
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: v_mov_b32_e32 v3, s7
; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-NEXT: v_mov_b32_e32 v4, s36
+; GFX7-NEXT: v_mov_b32_e32 v4, s16
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: v_mov_b32_e32 v5, s37
+; GFX7-NEXT: v_mov_b32_e32 v5, s17
; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index ff55ab8859c833..efc31fbd5ed9ee 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -2391,48 +2391,48 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_bfe_u32 s23, s9, 0x80008
; GFX7-HSA-NEXT: s_lshr_b32 s24, s10, 24
; GFX7-HSA-NEXT: s_bfe_u32 s25, s10, 0x80008
-; GFX7-HSA-NEXT: s_lshr_b32 s2, s11, 24
-; GFX7-HSA-NEXT: s_bfe_u32 s3, s11, 0x80008
-; GFX7-HSA-NEXT: s_and_b32 s26, s4, 0xff
+; GFX7-HSA-NEXT: s_lshr_b32 s26, s11, 24
+; GFX7-HSA-NEXT: s_bfe_u32 s27, s11, 0x80008
+; GFX7-HSA-NEXT: s_and_b32 s28, s4, 0xff
; GFX7-HSA-NEXT: s_bfe_u32 s4, s4, 0x80010
-; GFX7-HSA-NEXT: s_and_b32 s27, s5, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s29, s5, 0xff
; GFX7-HSA-NEXT: s_bfe_u32 s5, s5, 0x80010
-; GFX7-HSA-NEXT: s_and_b32 s28, s6, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s30, s6, 0xff
; GFX7-HSA-NEXT: s_bfe_u32 s6, s6, 0x80010
-; GFX7-HSA-NEXT: s_and_b32 s29, s7, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s31, s7, 0xff
; GFX7-HSA-NEXT: s_bfe_u32 s7, s7, 0x80010
-; GFX7-HSA-NEXT: s_and_b32 s30, s8, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s33, s8, 0xff
; GFX7-HSA-NEXT: s_bfe_u32 s8, s8, 0x80010
-; GFX7-HSA-NEXT: s_and_b32 s31, s9, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s34, s9, 0xff
; GFX7-HSA-NEXT: s_bfe_u32 s9, s9, 0x80010
-; GFX7-HSA-NEXT: s_and_b32 s33, s10, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s35, s10, 0xff
; GFX7-HSA-NEXT: s_bfe_u32 s10, s10, 0x80010
-; GFX7-HSA-NEXT: s_and_b32 s34, s11, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s36, s11, 0xff
; GFX7-HSA-NEXT: s_bfe_u32 s11, s11, 0x80010
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x70
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s33
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s35
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s24
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 64
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s31
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22
@@ -2441,7 +2441,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s33
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s20
@@ -2450,7 +2450,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s29
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s31
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18
@@ -2459,21 +2459,21 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s16
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s27
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s29
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s14
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12
@@ -2880,33 +2880,33 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_ashr_i32 s30, s10, 24
; GFX7-HSA-NEXT: s_bfe_i32 s31, s10, 0x80010
; GFX7-HSA-NEXT: s_bfe_i32 s33, s10, 0x80008
-; GFX7-HSA-NEXT: s_ashr_i32 s2, s11, 24
-; GFX7-HSA-NEXT: s_bfe_i32 s3, s11, 0x80010
-; GFX7-HSA-NEXT: s_bfe_i32 s34, s11, 0x80008
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s2
+; GFX7-HSA-NEXT: s_ashr_i32 s34, s11, 24
+; GFX7-HSA-NEXT: s_bfe_i32 s35, s11, 0x80010
+; GFX7-HSA-NEXT: s_bfe_i32 s36, s11, 0x80008
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x70
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s34
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s3
; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s33
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s30
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s35
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s34
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s33
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s31
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s30
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GFX7-HSA-NEXT: s_sext_i32_i8 s9, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 64
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9
@@ -3281,32 +3281,32 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s2, 24
; GFX6-NOHSA-NEXT: s_bfe_u32 s23, s2, 0x80008
; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s3, 24
-; GFX6-NOHSA-NEXT: s_bfe_u32 s26, s3, 0x80008
-; GFX6-NOHSA-NEXT: s_lshr_b32 s27, s4, 24
-; GFX6-NOHSA-NEXT: s_bfe_u32 s28, s4, 0x80008
-; GFX6-NOHSA-NEXT: s_lshr_b32 s29, s5, 24
-; GFX6-NOHSA-NEXT: s_bfe_u32 s30, s5, 0x80008
-; GFX6-NOHSA-NEXT: s_lshr_b32 s31, s6, 24
-; GFX6-NOHSA-NEXT: s_bfe_u32 s33, s6, 0x80008
-; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s7, 24
-; GFX6-NOHSA-NEXT: s_bfe_u32 s35, s7, 0x80008
-; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s8, 24
-; GFX6-NOHSA-NEXT: s_bfe_u32 s37, s8, 0x80008
-; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s9, 24
-; GFX6-NOHSA-NEXT: s_bfe_u32 s39, s9, 0x80008
-; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s10, 24
-; GFX6-NOHSA-NEXT: s_bfe_u32 s41, s10, 0x80008
-; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s11, 24
-; GFX6-NOHSA-NEXT: s_bfe_u32 s43, s11, 0x80008
-; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s12, 24
-; GFX6-NOHSA-NEXT: s_bfe_u32 s45, s12, 0x80008
-; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s13, 24
-; GFX6-NOHSA-NEXT: s_bfe_u32 s47, s13, 0x80008
-; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s14, 24
-; GFX6-NOHSA-NEXT: s_bfe_u32 s49, s14, 0x80008
-; GFX6-NOHSA-NEXT: s_lshr_b32 s50, s15, 24
-; GFX6-NOHSA-NEXT: s_bfe_u32 s51, s15, 0x80008
-; GFX6-NOHSA-NEXT: s_and_b32 s52, s0, 0xff
+; GFX6-NOHSA-NEXT: s_bfe_u32 s27, s3, 0x80008
+; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s4, 24
+; GFX6-NOHSA-NEXT: s_bfe_u32 s29, s4, 0x80008
+; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s5, 24
+; GFX6-NOHSA-NEXT: s_bfe_u32 s31, s5, 0x80008
+; GFX6-NOHSA-NEXT: s_lshr_b32 s33, s6, 24
+; GFX6-NOHSA-NEXT: s_bfe_u32 s34, s6, 0x80008
+; GFX6-NOHSA-NEXT: s_lshr_b32 s35, s7, 24
+; GFX6-NOHSA-NEXT: s_bfe_u32 s36, s7, 0x80008
+; GFX6-NOHSA-NEXT: s_lshr_b32 s37, s8, 24
+; GFX6-NOHSA-NEXT: s_bfe_u32 s38, s8, 0x80008
+; GFX6-NOHSA-NEXT: s_lshr_b32 s39, s9, 24
+; GFX6-NOHSA-NEXT: s_bfe_u32 s40, s9, 0x80008
+; GFX6-NOHSA-NEXT: s_lshr_b32 s41, s10, 24
+; GFX6-NOHSA-NEXT: s_bfe_u32 s42, s10, 0x80008
+; GFX6-NOHSA-NEXT: s_lshr_b32 s43, s11, 24
+; GFX6-NOHSA-NEXT: s_bfe_u32 s44, s11, 0x80008
+; GFX6-NOHSA-NEXT: s_lshr_b32 s45, s12, 24
+; GFX6-NOHSA-NEXT: s_bfe_u32 s46, s12, 0x80008
+; GFX6-NOHSA-NEXT: s_lshr_b32 s47, s13, 24
+; GFX6-NOHSA-NEXT: s_bfe_u32 s48, s13, 0x80008
+; GFX6-NOHSA-NEXT: s_lshr_b32 s49, s14, 24
+; GFX6-NOHSA-NEXT: s_bfe_u32 s50, s14, 0x80008
+; GFX6-NOHSA-NEXT: s_lshr_b32 s51, s15, 24
+; GFX6-NOHSA-NEXT: s_bfe_u32 s52, s15, 0x80008
+; GFX6-NOHSA-NEXT: s_and_b32 s26, s0, 0xff
; GFX6-NOHSA-NEXT: s_bfe_u32 s25, s0, 0x80010
; GFX6-NOHSA-NEXT: s_and_b32 s53, s1, 0xff
; GFX6-NOHSA-NEXT: s_bfe_u32 s54, s1, 0x80010
@@ -3327,92 +3327,91 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX6-NOHSA-NEXT: s_and_b32 s64, s9, 0xff
; GFX6-NOHSA-NEXT: s_bfe_u32 s9, s9, 0x80010
; GFX6-NOHSA-NEXT: s_and_b32 s65, s10, 0xff
-; GFX6-NOHSA-NEXT: s_bfe_u32 s10, s10, 0x80010
; GFX6-NOHSA-NEXT: s_and_b32 s66, s11, 0xff
+; GFX6-NOHSA-NEXT: s_bfe_u32 s11, s11, 0x80010
; GFX6-NOHSA-NEXT: s_and_b32 s67, s12, 0xff
; GFX6-NOHSA-NEXT: s_bfe_u32 s12, s12, 0x80010
; GFX6-NOHSA-NEXT: s_and_b32 s68, s13, 0xff
+; GFX6-NOHSA-NEXT: s_bfe_u32 s13, s13, 0x80010
; GFX6-NOHSA-NEXT: s_and_b32 s69, s14, 0xff
; GFX6-NOHSA-NEXT: s_bfe_u32 s14, s14, 0x80010
; GFX6-NOHSA-NEXT: s_and_b32 s70, s15, 0xff
; GFX6-NOHSA-NEXT: s_bfe_u32 s15, s15, 0x80010
-; GFX6-NOHSA-NEXT: s_bfe_u32 s13, s13, 0x80010
-; GFX6-NOHSA-NEXT: s_bfe_u32 s11, s11, 0x80010
+; GFX6-NOHSA-NEXT: s_bfe_u32 s10, s10, 0x80010
; GFX6-NOHSA-NEXT: s_mov_b32 s0, s16
; GFX6-NOHSA-NEXT: s_mov_b32 s1, s17
; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s70
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s51
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s52
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s15
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s50
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s51
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s69
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s49
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s50
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s14
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s48
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s49
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s68
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s47
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s48
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s13
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s46
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s47
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s67
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s45
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s46
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s12
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s45
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s66
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s44
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s11
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s44
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
-; GFX6-NOHSA-NEXT: s_waitcnt expcnt(3)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s66
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s43
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s11
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s42
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s65
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s41
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s43
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s42
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s10
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s40
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s41
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s64
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s39
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s40
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s9
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s38
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s39
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s63
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s38
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s36
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s37
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s62
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s35
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s36
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s34
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s61
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s33
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s34
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s31
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s33
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s60
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s30
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s31
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s29
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s30
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s59
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s28
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s29
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s27
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s28
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s57
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s26
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s27
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s58
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s24
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
@@ -3429,7 +3428,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s20
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s52
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s26
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s19
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s25
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s18
@@ -3455,25 +3454,25 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_lshr_b32 s31, s5, 24
; GFX7-HSA-NEXT: s_bfe_u32 s33, s5, 0x80008
; GFX7-HSA-NEXT: s_lshr_b32 s35, s6, 24
-; GFX7-HSA-NEXT: s_bfe_u32 s36, s6, 0x80008
+; GFX7-HSA-NEXT: s_bfe_u32 s37, s6, 0x80008
; GFX7-HSA-NEXT: s_lshr_b32 s38, s7, 24
; GFX7-HSA-NEXT: s_bfe_u32 s39, s7, 0x80008
; GFX7-HSA-NEXT: s_lshr_b32 s41, s8, 24
-; GFX7-HSA-NEXT: s_bfe_u32 s42, s8, 0x80008
-; GFX7-HSA-NEXT: s_lshr_b32 s43, s9, 24
-; GFX7-HSA-NEXT: s_bfe_u32 s44, s9, 0x80008
-; GFX7-HSA-NEXT: s_lshr_b32 s45, s10, 24
-; GFX7-HSA-NEXT: s_bfe_u32 s46, s10, 0x80008
-; GFX7-HSA-NEXT: s_lshr_b32 s47, s11, 24
-; GFX7-HSA-NEXT: s_bfe_u32 s48, s11, 0x80008
-; GFX7-HSA-NEXT: s_lshr_b32 s49, s12, 24
-; GFX7-HSA-NEXT: s_bfe_u32 s50, s12, 0x80008
-; GFX7-HSA-NEXT: s_lshr_b32 s51, s13, 24
-; GFX7-HSA-NEXT: s_bfe_u32 s52, s13, 0x80008
-; GFX7-HSA-NEXT: s_lshr_b32 s53, s14, 24
-; GFX7-HSA-NEXT: s_bfe_u32 s54, s14, 0x80008
-; GFX7-HSA-NEXT: s_lshr_b32 s55, s15, 24
-; GFX7-HSA-NEXT: s_bfe_u32 s56, s15, 0x80008
+; GFX7-HSA-NEXT: s_bfe_u32 s43, s8, 0x80008
+; GFX7-HSA-NEXT: s_lshr_b32 s44, s9, 24
+; GFX7-HSA-NEXT: s_bfe_u32 s46, s9, 0x80008
+; GFX7-HSA-NEXT: s_lshr_b32 s47, s10, 24
+; GFX7-HSA-NEXT: s_bfe_u32 s48, s10, 0x80008
+; GFX7-HSA-NEXT: s_lshr_b32 s49, s11, 24
+; GFX7-HSA-NEXT: s_bfe_u32 s50, s11, 0x80008
+; GFX7-HSA-NEXT: s_lshr_b32 s51, s12, 24
+; GFX7-HSA-NEXT: s_bfe_u32 s52, s12, 0x80008
+; GFX7-HSA-NEXT: s_lshr_b32 s53, s13, 24
+; GFX7-HSA-NEXT: s_bfe_u32 s54, s13, 0x80008
+; GFX7-HSA-NEXT: s_lshr_b32 s55, s14, 24
+; GFX7-HSA-NEXT: s_bfe_u32 s56, s14, 0x80008
+; GFX7-HSA-NEXT: s_lshr_b32 s57, s15, 24
+; GFX7-HSA-NEXT: s_bfe_u32 s58, s15, 0x80008
; GFX7-HSA-NEXT: s_and_b32 s24, s0, 0xff
; GFX7-HSA-NEXT: s_bfe_u32 s0, s0, 0x80010
; GFX7-HSA-NEXT: s_and_b32 s27, s1, 0xff
@@ -3482,18 +3481,18 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_bfe_u32 s2, s2, 0x80010
; GFX7-HSA-NEXT: s_and_b32 s34, s3, 0xff
; GFX7-HSA-NEXT: s_bfe_u32 s3, s3, 0x80010
-; GFX7-HSA-NEXT: s_and_b32 s37, s4, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s36, s4, 0xff
; GFX7-HSA-NEXT: s_bfe_u32 s4, s4, 0x80010
; GFX7-HSA-NEXT: s_and_b32 s40, s5, 0xff
; GFX7-HSA-NEXT: s_bfe_u32 s5, s5, 0x80010
-; GFX7-HSA-NEXT: s_and_b32 s57, s6, 0xff
-; GFX7-HSA-NEXT: s_bfe_u32 s58, s6, 0x80010
-; GFX7-HSA-NEXT: s_and_b32 s59, s7, 0xff
-; GFX7-HSA-NEXT: s_bfe_u32 s60, s7, 0x80010
-; GFX7-HSA-NEXT: s_and_b32 s61, s8, 0xff
-; GFX7-HSA-NEXT: s_bfe_u32 s8, s8, 0x80010
-; GFX7-HSA-NEXT: s_and_b32 s62, s9, 0xff
-; GFX7-HSA-NEXT: s_bfe_u32 s9, s9, 0x80010
+; GFX7-HSA-NEXT: s_and_b32 s42, s6, 0xff
+; GFX7-HSA-NEXT: s_bfe_u32 s6, s6, 0x80010
+; GFX7-HSA-NEXT: s_and_b32 s45, s7, 0xff
+; GFX7-HSA-NEXT: s_bfe_u32 s7, s7, 0x80010
+; GFX7-HSA-NEXT: s_and_b32 s59, s8, 0xff
+; GFX7-HSA-NEXT: s_bfe_u32 s60, s8, 0x80010
+; GFX7-HSA-NEXT: s_and_b32 s61, s9, 0xff
+; GFX7-HSA-NEXT: s_bfe_u32 s62, s9, 0x80010
; GFX7-HSA-NEXT: s_and_b32 s63, s10, 0xff
; GFX7-HSA-NEXT: s_bfe_u32 s10, s10, 0x80010
; GFX7-HSA-NEXT: s_and_b32 s64, s11, 0xff
@@ -3506,97 +3505,97 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_bfe_u32 s14, s14, 0x80010
; GFX7-HSA-NEXT: s_and_b32 s68, s15, 0xff
; GFX7-HSA-NEXT: s_bfe_u32 s15, s15, 0x80010
-; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xf0
-; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s7
-; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s6
-; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xe0
-; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s7
-; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s6
-; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xd0
-; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s7
-; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s6
-; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xc0
-; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s7
-; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s6
-; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xb0
-; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s7
-; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s6
-; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xa0
-; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s7
-; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s6
-; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x90
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s67
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s54
-; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s14
-; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s53
-; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s7
-; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s6
-; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x80
-; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s7
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s68
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s56
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s55
+; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xf0
+; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s8
+; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xe0
+; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s8
+; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xd0
+; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s8
+; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xc0
+; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s8
+; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xb0
+; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s8
+; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xa0
; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s66
-; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s52
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s54
; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s13
-; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s51
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s53
+; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11]
; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s65
-; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s50
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8
+; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x90
+; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s52
; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s12
-; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s49
+; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s51
+; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s68
+; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s8
+; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x80
+; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s58
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s57
+; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s8
+; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x70
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3]
+; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s63
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s48
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s47
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s64
-; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s48
-; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s11
-; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s47
-; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s6
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s61
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s42
-; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s8
-; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s41
-; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x70
-; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s63
-; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s46
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s62
-; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s45
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s44
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s43
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19]
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23]
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7]
-; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s59
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s60
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s38
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6
; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s50
+; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s11
+; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s49
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s7
; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s57
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s58
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s67
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s56
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s14
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s55
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s61
+; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s7
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s46
+; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s59
+; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s44
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s43
+; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s6
; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s60
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s45
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s41
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s42
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s39
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s38
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37
+; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s8
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40
@@ -3610,7 +3609,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_add_u32 s4, s16, 64
; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s37
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
@@ -4235,16 +4234,17 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX6-NOHSA-NEXT: s_bfe_i32 s54, s10, 0x80010
; GFX6-NOHSA-NEXT: s_bfe_i32 s55, s10, 0x80008
; GFX6-NOHSA-NEXT: s_sext_i32_i8 s10, s10
-; GFX6-NOHSA-NEXT: s_ashr_i32 s56, s11, 24
-; GFX6-NOHSA-NEXT: s_bfe_i32 s57, s11, 0x80010
-; GFX6-NOHSA-NEXT: s_bfe_i32 s58, s11, 0x80008
-; GFX6-NOHSA-NEXT: s_sext_i32_i8 s11, s11
-; GFX6-NOHSA-NEXT: s_bfe_i32 s59, s12, 0x80010
-; GFX6-NOHSA-NEXT: s_bfe_i32 s60, s12, 0x80008
-; GFX6-NOHSA-NEXT: s_sext_i32_i8 s61, s12
+; GFX6-NOHSA-NEXT: s_bfe_i32 s56, s11, 0x80010
+; GFX6-NOHSA-NEXT: s_bfe_i32 s57, s11, 0x80008
+; GFX6-NOHSA-NEXT: s_sext_i32_i8 s58, s11
+; GFX6-NOHSA-NEXT: s_ashr_i32 s59, s12, 24
+; GFX6-NOHSA-NEXT: s_bfe_i32 s60, s12, 0x80010
+; GFX6-NOHSA-NEXT: s_bfe_i32 s61, s12, 0x80008
+; GFX6-NOHSA-NEXT: s_sext_i32_i8 s12, s12
; GFX6-NOHSA-NEXT: s_ashr_i32 s62, s13, 24
; GFX6-NOHSA-NEXT: s_bfe_i32 s63, s13, 0x80010
; GFX6-NOHSA-NEXT: s_bfe_i32 s64, s13, 0x80008
+; GFX6-NOHSA-NEXT: s_sext_i32_i8 s13, s13
; GFX6-NOHSA-NEXT: s_ashr_i32 s65, s14, 24
; GFX6-NOHSA-NEXT: s_bfe_i32 s66, s14, 0x80010
; GFX6-NOHSA-NEXT: s_bfe_i32 s67, s14, 0x80008
@@ -4253,8 +4253,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX6-NOHSA-NEXT: s_bfe_i32 s69, s15, 0x80010
; GFX6-NOHSA-NEXT: s_bfe_i32 s70, s15, 0x80008
; GFX6-NOHSA-NEXT: s_sext_i32_i8 s15, s15
-; GFX6-NOHSA-NEXT: s_sext_i32_i8 s13, s13
-; GFX6-NOHSA-NEXT: s_ashr_i32 s12, s12, 24
+; GFX6-NOHSA-NEXT: s_ashr_i32 s11, s11, 24
; GFX6-NOHSA-NEXT: s_mov_b32 s0, s16
; GFX6-NOHSA-NEXT: s_mov_b32 s1, s17
; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
@@ -4271,25 +4270,24 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s64
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s63
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s62
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s61
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s60
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s59
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s12
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s61
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s60
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s59
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s58
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s57
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s56
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s12
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
-; GFX6-NOHSA-NEXT: s_waitcnt expcnt(3)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s11
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s58
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s57
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s56
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s10
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s11
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s55
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s54
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s53
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s9
@@ -4380,7 +4378,8 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_ashr_i32 s37, s6, 24
; GFX7-HSA-NEXT: s_bfe_i32 s38, s6, 0x80010
; GFX7-HSA-NEXT: s_bfe_i32 s39, s6, 0x80008
-; GFX7-HSA-NEXT: s_ashr_i32 s40, s7, 24
+; GFX7-HSA-NEXT: s_sext_i32_i8 s40, s6
+; GFX7-HSA-NEXT: s_ashr_i32 s6, s7, 24
; GFX7-HSA-NEXT: s_bfe_i32 s41, s7, 0x80010
; GFX7-HSA-NEXT: s_bfe_i32 s42, s7, 0x80008
; GFX7-HSA-NEXT: s_ashr_i32 s43, s8, 24
@@ -4411,104 +4410,103 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xf0
; GFX7-HSA-NEXT: s_sext_i32_i8 s50, s9
; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s9
-; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s8
+; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s8
; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xe0
; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s9
-; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s8
+; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s8
; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xd0
; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s9
-; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s8
+; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s8
; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xc0
; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s9
-; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s8
+; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s8
; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xb0
; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s9
-; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s8
+; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s9
+; GFX7-HSA-NEXT: s_sext_i32_i8 s13, s13
+; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s8
; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xa0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s13
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s62
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s61
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s60
; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s9
-; GFX7-HSA-NEXT: s_sext_i32_i8 s14, s14
-; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s8
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11]
+; GFX7-HSA-NEXT: s_sext_i32_i8 s12, s12
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8
; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x90
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s65
-; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s64
-; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s63
+; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s12
+; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s59
+; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s58
+; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s57
; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s9
-; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s8
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15]
+; GFX7-HSA-NEXT: s_sext_i32_i8 s15, s15
+; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s8
; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x80
; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
+; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7
; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11
-; GFX7-HSA-NEXT: s_sext_i32_i8 s12, s12
-; GFX7-HSA-NEXT: s_sext_i32_i8 s13, s13
-; GFX7-HSA-NEXT: s_sext_i32_i8 s15, s15
-; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s9
-; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s15
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s68
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s67
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s66
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s13
-; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s62
-; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s61
-; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s60
-; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s12
-; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s59
-; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s58
-; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s57
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s6
+; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x70
; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s11
; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s56
; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s55
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s54
-; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s8
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s7
+; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19]
+; GFX7-HSA-NEXT: s_sext_i32_i8 s14, s14
+; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s6
+; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60
+; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
+; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s65
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s64
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s63
+; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s50
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s53
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s52
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s51
+; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s49
+; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s48
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s46
+; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s47
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s45
+; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s6
+; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50
+; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5
+; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s8
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s44
; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s43
-; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x70
-; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7
-; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s53
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s52
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s50
-; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s51
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s49
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s48
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s47
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19]
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23]
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7]
-; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
-; GFX7-HSA-NEXT: s_sext_i32_i8 s6, s6
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s42
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s41
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s40
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60
-; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s42
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s41
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s38
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s37
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4
@@ -6819,80 +6817,82 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX6-NOHSA: ; %bb.0:
; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s7, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 8
-; GFX6-NOHSA-NEXT: s_mov_b32 s12, s7
-; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s6, 8
-; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s5, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s5, 8
-; GFX6-NOHSA-NEXT: s_mov_b32 s24, s5
-; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s4, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s4, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s4, 8
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x80000
-; GFX6-NOHSA-NEXT: s_ashr_i64 s[36:37], s[4:5], 56
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[6:7], 0x80000
-; GFX6-NOHSA-NEXT: s_ashr_i64 s[40:41], s[6:7], 56
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000
+; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s11, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s11, 8
+; GFX6-NOHSA-NEXT: s_mov_b32 s4, s11
+; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s10, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s10, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s10, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s9, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s9, 8
+; GFX6-NOHSA-NEXT: s_mov_b32 s26, s9
+; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s8, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s8, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s8, 8
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x80000
+; GFX6-NOHSA-NEXT: s_ashr_i64 s[36:37], s[8:9], 56
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[10:11], 0x80000
+; GFX6-NOHSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 56
; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[4:5], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[34:35], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[30:31], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s40
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s41
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s12
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s13
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s10
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s11
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s40
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s41
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s38
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s39
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s36
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s37
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s25
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s34
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s35
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s26
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s27
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s12
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s13
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s10
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s11
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s7
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s14
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s15
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
-; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s15
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s16
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s18
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s19
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s18
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s19
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s20
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s21
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s22
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s23
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s22
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s23
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s24
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s25
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s26
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s27
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s4
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s5
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s28
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s29
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s8
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s9
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s5
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NOHSA-NEXT: s_endpgm
;
; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64:
@@ -6901,26 +6901,30 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s12, s7, 8
-; GFX7-HSA-NEXT: s_mov_b32 s14, s7
-; GFX7-HSA-NEXT: s_lshr_b32 s16, s6, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s18, s6, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s20, s6, 8
-; GFX7-HSA-NEXT: s_lshr_b32 s22, s5, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s24, s5, 8
-; GFX7-HSA-NEXT: s_mov_b32 s26, s5
-; GFX7-HSA-NEXT: s_lshr_b32 s8, s4, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s8, s7, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 8
+; GFX7-HSA-NEXT: s_mov_b32 s12, s7
+; GFX7-HSA-NEXT: s_lshr_b32 s14, s6, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s16, s6, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s18, s6, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s20, s5, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s22, s5, 8
+; GFX7-HSA-NEXT: s_mov_b32 s24, s5
+; GFX7-HSA-NEXT: s_lshr_b32 s26, s4, 16
; GFX7-HSA-NEXT: s_lshr_b32 s28, s4, 24
; GFX7-HSA-NEXT: s_lshr_b32 s30, s4, 8
; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000
; GFX7-HSA-NEXT: s_ashr_i64 s[34:35], s[4:5], 56
; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x80000
-; GFX7-HSA-NEXT: s_ashr_i64 s[38:39], s[6:7], 56
+; GFX7-HSA-NEXT: s_ashr_i64 s[4:5], s[6:7], 56
+; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x80000
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5
; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
@@ -6929,31 +6933,27 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10
-; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x70
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s11
-; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s38
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s39
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11
+; GFX7-HSA-NEXT: s_add_u32 s26, s0, 0x70
+; GFX7-HSA-NEXT: s_addc_u32 s27, s1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10
; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x60
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s11
; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s27
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s11
; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x50
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s17
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11
; GFX7-HSA-NEXT: s_add_u32 s10, s0, 64
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -6961,15 +6961,15 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11
; GFX7-HSA-NEXT: s_add_u32 s10, s0, 48
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s34
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11
@@ -6977,10 +6977,10 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s25
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s23
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
@@ -7390,143 +7390,144 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GFX7-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: s_lshr_b32 s12, s4, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s13, s5, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s14, s6, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s15, s7, 24
; GFX7-HSA-NEXT: s_lshr_b32 s16, s8, 24
; GFX7-HSA-NEXT: s_lshr_b32 s17, s9, 24
; GFX7-HSA-NEXT: s_lshr_b32 s18, s10, 24
; GFX7-HSA-NEXT: s_lshr_b32 s19, s11, 24
-; GFX7-HSA-NEXT: s_bfe_u32 s20, s11, 0x80008
-; GFX7-HSA-NEXT: s_bfe_u32 s21, s10, 0x80008
-; GFX7-HSA-NEXT: s_bfe_u32 s22, s9, 0x80008
-; GFX7-HSA-NEXT: s_bfe_u32 s23, s8, 0x80008
-; GFX7-HSA-NEXT: s_bfe_u32 s24, s7, 0x80008
-; GFX7-HSA-NEXT: s_bfe_u32 s25, s6, 0x80008
-; GFX7-HSA-NEXT: s_bfe_u32 s26, s5, 0x80008
-; GFX7-HSA-NEXT: s_bfe_u32 s2, s4, 0x80008
-; GFX7-HSA-NEXT: s_and_b32 s3, s4, 0xff
-; GFX7-HSA-NEXT: s_and_b32 s27, s5, 0xff
-; GFX7-HSA-NEXT: s_and_b32 s28, s6, 0xff
-; GFX7-HSA-NEXT: s_and_b32 s29, s7, 0xff
-; GFX7-HSA-NEXT: s_and_b32 s30, s8, 0xff
-; GFX7-HSA-NEXT: s_and_b32 s31, s9, 0xff
-; GFX7-HSA-NEXT: s_and_b32 s33, s10, 0xff
-; GFX7-HSA-NEXT: s_and_b32 s34, s11, 0xff
-; GFX7-HSA-NEXT: s_bfe_u32 s35, s4, 0x80010
-; GFX7-HSA-NEXT: s_bfe_u32 s36, s5, 0x80010
-; GFX7-HSA-NEXT: s_bfe_u32 s6, s6, 0x80010
-; GFX7-HSA-NEXT: s_bfe_u32 s7, s7, 0x80010
+; GFX7-HSA-NEXT: s_lshr_b32 s20, s12, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s21, s13, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s22, s14, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s23, s15, 24
+; GFX7-HSA-NEXT: s_bfe_u32 s24, s15, 0x80008
+; GFX7-HSA-NEXT: s_bfe_u32 s25, s14, 0x80008
+; GFX7-HSA-NEXT: s_bfe_u32 s26, s13, 0x80008
+; GFX7-HSA-NEXT: s_bfe_u32 s27, s12, 0x80008
+; GFX7-HSA-NEXT: s_bfe_u32 s28, s11, 0x80008
+; GFX7-HSA-NEXT: s_bfe_u32 s29, s10, 0x80008
+; GFX7-HSA-NEXT: s_bfe_u32 s4, s9, 0x80008
+; GFX7-HSA-NEXT: s_bfe_u32 s2, s8, 0x80008
+; GFX7-HSA-NEXT: s_and_b32 s3, s8, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s5, s9, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s30, s10, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s31, s11, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s33, s12, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s34, s13, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s35, s14, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s36, s15, 0xff
; GFX7-HSA-NEXT: s_bfe_u32 s8, s8, 0x80010
; GFX7-HSA-NEXT: s_bfe_u32 s9, s9, 0x80010
; GFX7-HSA-NEXT: s_bfe_u32 s10, s10, 0x80010
-; GFX7-HSA-NEXT: s_bfe_u32 s4, s11, 0x80010
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xf0
-; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-HSA-NEXT: s_bfe_u32 s11, s11, 0x80010
+; GFX7-HSA-NEXT: s_bfe_u32 s12, s12, 0x80010
+; GFX7-HSA-NEXT: s_bfe_u32 s13, s13, 0x80010
+; GFX7-HSA-NEXT: s_bfe_u32 s14, s14, 0x80010
+; GFX7-HSA-NEXT: s_bfe_u32 s15, s15, 0x80010
+; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xf0
+; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xd0
+; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s6
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s7
+; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xb0
+; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s6
+; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x90
+; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s15
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s23
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s7
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s6
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
+; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x70
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s13
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s21
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s19
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xd0
+; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x50
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xb0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-HSA-NEXT: s_add_u32 s6, s0, 48
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s17
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x90
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x70
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x50
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-HSA-NEXT: s_add_u32 s4, s0, 48
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xe0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-HSA-NEXT: s_add_u32 s4, s0, 16
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xc0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s35
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xe0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xa0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xc0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s26
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x80
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s33
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s21
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xa0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s27
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x60
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s31
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x80
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s28
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-HSA-NEXT: s_add_u32 s6, s0, 64
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s23
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x60
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s29
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-HSA-NEXT: s_add_u32 s4, s0, 64
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s29
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX7-HSA-NEXT: s_nop 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4
; GFX7-HSA-NEXT: s_add_u32 s4, s0, 32
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5
; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s27
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s26
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
@@ -7976,74 +7977,85 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s7, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s7, 8
-; GFX6-NOHSA-NEXT: s_mov_b32 s50, s7
-; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s6, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 8
-; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s5, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s5, 8
-; GFX6-NOHSA-NEXT: s_mov_b32 s44, s5
-; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s4, 8
-; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s3, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s3, 8
-; GFX6-NOHSA-NEXT: s_mov_b32 s40, s3
-; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s2, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s2, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s2, 8
-; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s1, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s1, 8
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[50:51], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[44:45], 0x80000
-; GFX6-NOHSA-NEXT: s_mov_b32 s62, s1
-; GFX6-NOHSA-NEXT: s_lshr_b32 s56, s0, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s64, s0, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s66, s0, 8
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[0:1], 0x80000
-; GFX6-NOHSA-NEXT: s_ashr_i64 s[50:51], s[0:1], 56
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[2:3], 0x80000
+; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s7, 8
+; GFX6-NOHSA-NEXT: s_mov_b32 s38, s7
+; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s6, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s6, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s6, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s5, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s5, 8
+; GFX6-NOHSA-NEXT: s_mov_b32 s42, s5
+; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s4, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s4, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s3, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s3, 8
+; GFX6-NOHSA-NEXT: s_mov_b32 s34, s3
+; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s2, 16
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[42:43], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[38:39], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[62:63], s[10:11], 0x80000
+; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s2, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s2, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s1, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s1, 8
+; GFX6-NOHSA-NEXT: s_mov_b32 s54, s1
+; GFX6-NOHSA-NEXT: s_lshr_b32 s50, s0, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s52, s0, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s56, s0, 8
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000
+; GFX6-NOHSA-NEXT: s_ashr_i64 s[38:39], s[0:1], 56
+; GFX6-NOHSA-NEXT: s_ashr_i64 s[64:65], s[2:3], 56
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[66:67], s[4:5], 0x80000
; GFX6-NOHSA-NEXT: s_ashr_i64 s[68:69], s[4:5], 56
; GFX6-NOHSA-NEXT: s_bfe_i64 s[70:71], s[6:7], 0x80000
; GFX6-NOHSA-NEXT: s_ashr_i64 s[6:7], s[6:7], 56
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
-; GFX6-NOHSA-NEXT: s_ashr_i64 s[54:55], s[2:3], 56
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x80000
; GFX6-NOHSA-NEXT: s_mov_b32 s0, s8
; GFX6-NOHSA-NEXT: s_mov_b32 s1, s9
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s6
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s7
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s58
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s59
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s70
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s71
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s68
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s69
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s60
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s61
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s60
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s61
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s70
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s71
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s68
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s69
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s58
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s59
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s66
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s67
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s64
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s65
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s62
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s63
; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[46:47], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[40:41], 0x80000
; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s4
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s5
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s6
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s7
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:240
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[46:47], s[62:63], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[48:49], s[40:41], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[56:57], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[42:43], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s6
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[36:37], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s28
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s29
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s7
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s30
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s31
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s8
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s9
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:208
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[54:55], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[54:55], s[34:35], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[56:57], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[52:53], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[50:51], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[48:49], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[46:47], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[44:45], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
@@ -8052,81 +8064,65 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224
-; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s10
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s11
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s12
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s13
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208
-; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s54
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s55
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s14
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s15
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:192
-; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s48
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s49
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s17
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(3)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s54
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s55
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s52
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s53
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s18
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s19
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:160
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s12
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s13
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:176
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s14
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s15
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:160
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s16
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s17
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s18
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s19
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:144
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s20
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s21
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s22
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s23
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:144
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s50
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s51
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s25
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s38
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s39
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s20
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s21
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:128
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s40
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s41
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s46
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s47
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s26
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s27
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s22
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s23
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s10
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s11
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s24
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s25
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s44
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s45
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s28
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s29
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s30
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s31
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s26
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s27
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s42
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s43
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s36
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s37
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s34
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s35
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s36
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s37
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s38
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s39
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48
-; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s40
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s41
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
-; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s34
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s35
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s30
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s31
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s28
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s29
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s9
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s4
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s5
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s6
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s7
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0
; GFX6-NOHSA-NEXT: s_endpgm
;
; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64:
@@ -8136,33 +8132,33 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_lshr_b32 s14, s7, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s48, s7, 8
-; GFX7-HSA-NEXT: s_mov_b32 s50, s7
-; GFX7-HSA-NEXT: s_lshr_b32 s52, s6, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s54, s6, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s56, s6, 8
-; GFX7-HSA-NEXT: s_lshr_b32 s58, s5, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s60, s5, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s50, s7, 8
+; GFX7-HSA-NEXT: s_mov_b32 s52, s7
+; GFX7-HSA-NEXT: s_lshr_b32 s54, s6, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s56, s6, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s58, s6, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s60, s5, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s48, s5, 8
; GFX7-HSA-NEXT: s_mov_b32 s62, s5
-; GFX7-HSA-NEXT: s_lshr_b32 s44, s4, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s42, s4, 16
; GFX7-HSA-NEXT: s_lshr_b32 s40, s4, 24
; GFX7-HSA-NEXT: s_lshr_b32 s38, s4, 8
; GFX7-HSA-NEXT: s_lshr_b32 s36, s3, 16
; GFX7-HSA-NEXT: s_lshr_b32 s30, s3, 8
; GFX7-HSA-NEXT: s_mov_b32 s34, s3
-; GFX7-HSA-NEXT: s_lshr_b32 s28, s2, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s26, s2, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s26, s2, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s24, s2, 24
; GFX7-HSA-NEXT: s_lshr_b32 s22, s2, 8
-; GFX7-HSA-NEXT: s_lshr_b32 s18, s1, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s20, s1, 16
; GFX7-HSA-NEXT: s_lshr_b32 s64, s1, 8
; GFX7-HSA-NEXT: s_mov_b32 s16, s1
; GFX7-HSA-NEXT: s_lshr_b32 s66, s0, 16
; GFX7-HSA-NEXT: s_lshr_b32 s68, s0, 24
; GFX7-HSA-NEXT: s_lshr_b32 s70, s0, 8
; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000
-; GFX7-HSA-NEXT: s_ashr_i64 s[20:21], s[2:3], 56
-; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x80000
-; GFX7-HSA-NEXT: s_ashr_i64 s[42:43], s[4:5], 56
+; GFX7-HSA-NEXT: s_ashr_i64 s[18:19], s[2:3], 56
+; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[4:5], 0x80000
+; GFX7-HSA-NEXT: s_ashr_i64 s[44:45], s[4:5], 56
; GFX7-HSA-NEXT: s_ashr_i64 s[2:3], s[6:7], 56
; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000
@@ -8177,118 +8173,118 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[64:65], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
; GFX7-HSA-NEXT: s_add_u32 s64, s8, 0xf0
; GFX7-HSA-NEXT: s_addc_u32 s65, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s48
-; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xe0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s49
-; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s48
-; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s49
-; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xd0
-; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s46
-; GFX7-HSA-NEXT: s_add_u32 s46, s8, 0xc0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s47
-; GFX7-HSA-NEXT: s_addc_u32 s47, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s42
-; GFX7-HSA-NEXT: s_add_u32 s42, s8, 0xb0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s43
-; GFX7-HSA-NEXT: s_addc_u32 s43, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s42
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s50
+; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0xe0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s51
+; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s50
+; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s51
+; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0xd0
+; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s44
+; GFX7-HSA-NEXT: s_add_u32 s44, s8, 0xc0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s45
+; GFX7-HSA-NEXT: s_addc_u32 s45, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s44
; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s64
-; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s43
-; GFX7-HSA-NEXT: s_add_u32 s42, s8, 0xa0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s45
+; GFX7-HSA-NEXT: s_add_u32 s44, s8, 0xb0
; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s65
-; GFX7-HSA-NEXT: s_addc_u32 s43, s9, 0
+; GFX7-HSA-NEXT: s_addc_u32 s45, s9, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s50
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s40
-; GFX7-HSA-NEXT: s_add_u32 s40, s8, 0x90
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s51
-; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s48
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41
-; GFX7-HSA-NEXT: s_addc_u32 s41, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s52
-; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s53
-; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s54
-; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s55
-; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s49
-; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s46
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s52
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s42
+; GFX7-HSA-NEXT: s_add_u32 s42, s8, 0xa0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s53
+; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s50
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s43
+; GFX7-HSA-NEXT: s_addc_u32 s43, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s54
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s55
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s56
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s57
+; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s51
; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s56
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s24
-; GFX7-HSA-NEXT: s_add_u32 s24, s8, 0x80
-; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s57
-; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s47
-; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s58
-; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s59
-; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s42
; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s40
-; GFX7-HSA-NEXT: s_addc_u32 s25, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s24
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s28
+; GFX7-HSA-NEXT: s_add_u32 s28, s8, 0x90
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s29
+; GFX7-HSA-NEXT: s_addc_u32 s29, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s28
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s42
+; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s29
+; GFX7-HSA-NEXT: s_add_u32 s28, s8, 0x80
; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s62
; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s63
-; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s43
-; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s60
-; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s61
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s44
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s45
-; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s41
+; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s48
+; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s49
+; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s44
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s43
+; GFX7-HSA-NEXT: s_addc_u32 s29, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s60
+; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s61
+; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s45
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s18
+; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0x70
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s19
+; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s18
+; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s19
+; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0x60
+; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s18
+; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s46
+; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s47
+; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s58
+; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s59
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s40
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41
+; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s28
+; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s19
+; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0x50
; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s29
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s38
; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s39
-; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s25
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19]
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23]
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-HSA-NEXT: s_add_u32 s20, s8, 0x70
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21
-; GFX7-HSA-NEXT: s_addc_u32 s21, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21
-; GFX7-HSA-NEXT: s_add_u32 s20, s8, 0x60
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s21, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s30
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21
-; GFX7-HSA-NEXT: s_add_u32 s20, s8, 0x50
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s21, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s26
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s27
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s36
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s37
+; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s34
+; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s35
+; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s30
+; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s31
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
+; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s25
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12
@@ -8308,8 +8304,8 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21
; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 64f1f45bf734cf..4217384cdd5ce7 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -8733,4 +8733,4 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; ret void
; }
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" }
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 8f6a1f8c01ec34..5ce8a2b5f862e1 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -4645,4 +4645,4 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
ret void
}
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" }
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
index 018da7f81e3d4b..9f264de531950b 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
@@ -139,16 +139,16 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x80000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.2:
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]]
@@ -248,14 +248,14 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x80000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.2:
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
@@ -356,15 +356,15 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x80000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]], implicit [[V_CVT_I32_F64_e32_22]]
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.2:
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
@@ -464,27 +464,27 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x80000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.2:
; GFX908-NEXT: successors: %bb.3(0x80000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode
; GFX908-NEXT: S_NOP 0
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.3:
; GFX908-NEXT: successors: %bb.4(0x80000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]]
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.4:
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
@@ -600,29 +600,29 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x80000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_22]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_24]]
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.2:
; GFX908-NEXT: successors: %bb.3(0x80000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode
; GFX908-NEXT: S_NOP 0
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.3:
; GFX908-NEXT: successors: %bb.4(0x80000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]]
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_25]]
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.4:
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
@@ -722,6 +722,7 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
@@ -742,8 +743,6 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
- ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX908-NEXT: {{ $}}
@@ -759,8 +758,8 @@ body: |
; GFX908-NEXT: bb.2:
; GFX908-NEXT: successors: %bb.3(0x80000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.3:
; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000)
@@ -773,7 +772,8 @@ body: |
; GFX908-NEXT: bb.4:
; GFX908-NEXT: successors: %bb.1(0x80000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
; GFX908-NEXT: S_BRANCH %bb.1
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.5:
@@ -1114,14 +1114,6 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1
; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 2
@@ -1194,12 +1186,19 @@ body: |
; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 69
; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 70
; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 71
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 72
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 73
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 74
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 75
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 76
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 77
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 78
; GFX908-NEXT: [[S_MOV_B32_80:%[0-9]+]]:sgpr_32 = S_MOV_B32 79
; GFX908-NEXT: [[S_MOV_B32_81:%[0-9]+]]:sgpr_32 = S_MOV_B32 80
@@ -1216,13 +1215,14 @@ body: |
; GFX908-NEXT: bb.2:
; GFX908-NEXT: successors: %bb.3(0x80000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]]
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.3:
; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]]
; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc
; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc
@@ -1643,10 +1643,6 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1
; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 2
@@ -1719,6 +1715,10 @@ body: |
; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 69
; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 70
; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 71
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 72
; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 73
; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 74
@@ -2049,10 +2049,6 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1
; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 2
@@ -2125,9 +2121,13 @@ body: |
; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 69
; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 70
; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 71
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 72
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 73
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 74
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
; GFX908-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 75
; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 76
; GFX908-NEXT: [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 77
@@ -2801,6 +2801,7 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
@@ -2822,7 +2823,6 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
- ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX908-NEXT: {{ $}}
@@ -2988,7 +2988,6 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX908-NEXT: {{ $}}
@@ -3004,9 +3003,10 @@ body: |
; GFX908-NEXT: bb.2:
; GFX908-NEXT: successors: %bb.3(0x80000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]]
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.3:
; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000)
@@ -4974,20 +4974,20 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: undef [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode
+ ; GFX908-NEXT: undef [[V_CVT_I32_F64_e32_16:%[0-9]+]].sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_16:%[0-9]+]].sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x80000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub3:vreg_128 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_21]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_16:%[0-9]+]].sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_16:%[0-9]+]].sub3:vreg_128 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_16]]
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.2:
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
@@ -4998,9 +4998,9 @@ body: |
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_14]], implicit [[V_CVT_I32_F64_e32_15]]
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]]
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_19]], implicit [[V_CVT_I32_F64_e32_20]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_21]]
; GFX908-NEXT: S_ENDPGM 0
bb.0:
successors: %bb.1
@@ -5192,13 +5192,13 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x80000000)
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]]
+ ; GFX908-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_F64_I32_e32_]]
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.2:
@@ -5297,7 +5297,6 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x80000000)
@@ -5305,6 +5304,7 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]]
+ ; GFX908-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_F64_I32_e32_]]
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.2:
@@ -5726,17 +5726,17 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x80000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.2:
- ; GFX908-NEXT: DBG_VALUE [[V_CVT_I32_F64_e32_23]], 0, 0
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
+ ; GFX908-NEXT: DBG_VALUE %23, 0, 0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]]
@@ -5836,17 +5836,17 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
; GFX908-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def %22, 327689 /* reguse:SReg_1_with_sub0 */, [[V_CVT_I32_F64_e32_4]]
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x80000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]]
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.2:
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]]
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
index 851c9bb02a3456..127656f7aa626c 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -589,6 +589,6 @@ declare void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noalias nocapture writeonly
declare void @llvm.memcpy.p0.p3.i64(ptr noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #2
-attributes #0 = { minsize }
-attributes #1 = { optsize }
+attributes #0 = { minsize "amdgpu-flat-work-group-size"="1024,1024" }
+attributes #1 = { optsize "amdgpu-flat-work-group-size"="1024,1024" }
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.mir b/llvm/test/CodeGen/AMDGPU/memory_clause.mir
index f7e295a91c8281..4b0226a0f6586b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.mir
@@ -263,10 +263,10 @@ body: |
# GCN-NEXT: dead %6:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 64, 0, implicit $exec
# GCN-NEXT: dead %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 80, 0, implicit $exec
# GCN-NEXT: dead %8:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 96, 0, implicit $exec
-# GCN-NEXT: KILL %0{{$}}
# GCN-NEXT: dead %9:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 128, 0, implicit $exec
# GCN-NEXT: dead %10:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 144, 0, implicit $exec
# GCN-NEXT: KILL %1{{$}}
+# GCN-NEXT: KILL %0{{$}}
---
name: reg_pressure
diff --git a/llvm/test/CodeGen/AMDGPU/min-waves-per-eu-not-respected.ll b/llvm/test/CodeGen/AMDGPU/min-waves-per-eu-not-respected.ll
index 239fa80ade98a9..04f2e3235d44a7 100644
--- a/llvm/test/CodeGen/AMDGPU/min-waves-per-eu-not-respected.ll
+++ b/llvm/test/CodeGen/AMDGPU/min-waves-per-eu-not-respected.ll
@@ -12,5 +12,5 @@ define amdgpu_kernel void @impossible_occupancy() #1 {
ret void
}
-attributes #0 = { "amdgpu-flat-work-group-size"="1,1024" "amdgpu-waves-per-eu"="9" }
+attributes #0 = { "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-waves-per-eu"="9" }
attributes #1 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="11" }
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index bb7a591c914654..01eb1b1a353d12 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -2994,71 +2994,70 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0
-; VI-NEXT: v_mov_b32_e32 v11, 0
+; VI-NEXT: v_mov_b32_e32 v10, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_add_u32_e32 v8, vcc, s2, v2
-; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v12, vcc, s2, v2
+; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: flat_load_dwordx4 v[4:7], v[8:9]
+; VI-NEXT: flat_load_dwordx4 v[4:7], v[12:13]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mul_lo_u32 v10, v4, v3
-; VI-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v4, v2, 0
-; VI-NEXT: v_mul_lo_u32 v14, v5, v2
-; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0
-; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v10
-; VI-NEXT: v_mov_b32_e32 v10, v3
-; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v1, v4, v[10:11]
-; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v14
-; VI-NEXT: v_mov_b32_e32 v10, v4
-; VI-NEXT: v_mov_b32_e32 v4, v11
-; VI-NEXT: v_mul_lo_u32 v7, v7, v0
-; VI-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v6, v0, v[12:13]
-; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[3:4]
-; VI-NEXT: v_add_u32_e32 v13, vcc, v7, v13
-; VI-NEXT: v_mov_b32_e32 v0, v4
-; VI-NEXT: v_mul_lo_u32 v11, v6, v1
-; VI-NEXT: v_add_u32_e32 v6, vcc, v10, v0
-; VI-NEXT: v_addc_u32_e64 v7, s[0:1], 0, 0, vcc
-; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7]
-; VI-NEXT: v_add_u32_e32 v5, vcc, v11, v13
-; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v12
-; VI-NEXT: v_addc_u32_e32 v5, vcc, v1, v5, vcc
-; VI-NEXT: flat_store_dwordx4 v[8:9], v[2:5]
+; VI-NEXT: v_mul_lo_u32 v3, v4, v3
+; VI-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v4, v2, 0
+; VI-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0
+; VI-NEXT: v_mul_lo_u32 v2, v5, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, v15, v3
+; VI-NEXT: v_add_u32_e32 v15, vcc, v3, v2
+; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, v4, v[9:10]
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: v_mov_b32_e32 v3, v10
+; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v5, v[2:3]
+; VI-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v6, v0, v[14:15]
+; VI-NEXT: v_add_u32_e32 v3, vcc, v4, v3
+; VI-NEXT: v_addc_u32_e64 v4, s[0:1], 0, 0, vcc
+; VI-NEXT: v_mul_lo_u32 v0, v7, v0
+; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v1, v5, v[3:4]
+; VI-NEXT: v_mul_lo_u32 v1, v6, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v10
+; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v10, vcc, v3, v9
+; VI-NEXT: v_addc_u32_e32 v11, vcc, v4, v0, vcc
+; VI-NEXT: v_mov_b32_e32 v9, v2
+; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_mul_i128:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0
-; GFX9-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 4, v0
+; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1]
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1]
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0
-; GFX9-NEXT: v_mul_lo_u32 v14, v5, v2
-; GFX9-NEXT: v_mul_lo_u32 v15, v4, v3
-; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[0:1], v1, v4, v[9:10]
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0
-; GFX9-NEXT: v_mul_lo_u32 v16, v7, v0
-; GFX9-NEXT: v_mov_b32_e32 v7, v12
-; GFX9-NEXT: v_mov_b32_e32 v12, v10
-; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[11:12]
-; GFX9-NEXT: v_add3_u32 v3, v3, v15, v14
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v0, v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, v10
-; GFX9-NEXT: v_mul_lo_u32 v4, v6, v1
-; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v0
-; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, 0, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7]
-; GFX9-NEXT: v_add3_u32 v3, v16, v3, v4
-; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v0, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v3, vcc
-; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3]
+; GFX9-NEXT: v_mul_lo_u32 v10, v5, v2
+; GFX9-NEXT: v_mul_lo_u32 v13, v4, v3
+; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v2, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0
+; GFX9-NEXT: v_add3_u32 v9, v9, v13, v10
+; GFX9-NEXT: v_mul_lo_u32 v13, v6, v1
+; GFX9-NEXT: v_mov_b32_e32 v10, v3
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v1, v4, v[10:11]
+; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v0, v[8:9]
+; GFX9-NEXT: v_mov_b32_e32 v10, v4
+; GFX9-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[3:4]
+; GFX9-NEXT: v_mul_lo_u32 v0, v7, v0
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v4
+; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, 0, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, v5, v[10:11]
+; GFX9-NEXT: v_add3_u32 v0, v0, v9, v13
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v0, vcc
+; GFX9-NEXT: global_store_dwordx4 v12, v[2:5], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul_i128:
diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index 1e9994dd8e6efd..299bbdac600917 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -73,22 +73,22 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3)
; GFX9-NEXT: .LBB1_2: ; %bb23
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v0
-; GFX9-NEXT: v_add_u32_e32 v18, v9, v0
; GFX9-NEXT: v_add_u32_e32 v12, v17, v0
-; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
; GFX9-NEXT: v_madak_f32 v3, v3, v7, 0x3727c5ac
; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT: v_mul_u32_u24_e32 v19, v3, v5
-; GFX9-NEXT: v_add_u32_e32 v20, v3, v16
-; GFX9-NEXT: v_sub_u32_e32 v3, v18, v19
-; GFX9-NEXT: v_sub_u32_e32 v12, v12, v19
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v20, v15, v[3:4]
-; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v20, v13
+; GFX9-NEXT: v_mul_u32_u24_e32 v18, v3, v5
+; GFX9-NEXT: v_add_u32_e32 v19, v3, v16
+; GFX9-NEXT: v_add_u32_e32 v3, v9, v0
+; GFX9-NEXT: v_sub_u32_e32 v3, v3, v18
+; GFX9-NEXT: v_sub_u32_e32 v12, v12, v18
+; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v19, v13
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v19, v15, v[3:4]
; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v12, v14
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v18, s[4:5]
; GFX9-NEXT: v_lshlrev_b64 v[18:19], 2, v[3:4]
+; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
; GFX9-NEXT: v_add_co_u32_e64 v18, s[6:7], v10, v18
; GFX9-NEXT: v_addc_co_u32_e64 v19, s[6:7], v11, v19, s[6:7]
; GFX9-NEXT: global_load_dword v3, v[18:19], off
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index 37bf8516403bf5..312dfa3717c777 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -1616,24 +1616,24 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX9-NEXT: v_cvt_i32_f32_e32 v16, v16
; GFX9-NEXT: v_mad_f32 v3, -v17, v13, v3
; GFX9-NEXT: v_cvt_i32_f32_e32 v17, v17
-; GFX9-NEXT: v_mad_f32 v20, -v18, v4, v2
+; GFX9-NEXT: v_mad_f32 v2, -v18, v4, v2
; GFX9-NEXT: v_cvt_i32_f32_e32 v18, v18
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v11|, |v12|
; GFX9-NEXT: v_ashrrev_i32_e32 v14, 30, v14
; GFX9-NEXT: v_or_b32_e32 v9, 1, v9
-; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v10, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v13|
; GFX9-NEXT: v_or_b32_e32 v14, 1, v14
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v20|, |v4|
-; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v14, vcc
+; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v4|
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v14, vcc
; GFX9-NEXT: v_add_u32_e32 v1, v15, v1
-; GFX9-NEXT: v_add_u32_sdwa v2, v16, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_add_u32_sdwa v4, v16, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_add_u32_e32 v3, v17, v3
-; GFX9-NEXT: v_add_u32_sdwa v4, v18, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_add_u32_sdwa v2, v18, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: global_store_dword v[5:6], v1, off
; GFX9-NEXT: global_store_dword v[7:8], v0, off
@@ -1952,71 +1952,71 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX9-NEXT: global_load_dword v9, v[0:1], off
; GFX9-NEXT: s_mov_b32 s4, 0x2070306
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; GFX9-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v16, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v20, v3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v21, v14
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v14
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v16, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v19, v10
+; GFX9-NEXT: v_perm_b32 v1, v4, v9, s4
+; GFX9-NEXT: v_mul_f32_e32 v18, v13, v18
+; GFX9-NEXT: v_trunc_f32_e32 v18, v18
+; GFX9-NEXT: v_mad_f32 v13, -v18, v14, v13
+; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, |v14|
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v13, v3
+; GFX9-NEXT: v_mul_f32_e32 v14, v16, v19
+; GFX9-NEXT: v_trunc_f32_e32 v14, v14
+; GFX9-NEXT: v_mad_f32 v19, -v14, v10, v16
+; GFX9-NEXT: v_mul_f32_e32 v13, v10, v13
+; GFX9-NEXT: v_trunc_f32_e32 v13, v13
+; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v19|, |v10|
+; GFX9-NEXT: v_mad_f32 v10, -v13, v3, v10
; GFX9-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v22, v10
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v23, v16
-; GFX9-NEXT: v_mul_f32_e32 v20, v10, v20
-; GFX9-NEXT: v_xor_b32_sdwa v2, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
-; GFX9-NEXT: v_mul_f32_e32 v21, v13, v21
-; GFX9-NEXT: v_trunc_f32_e32 v20, v20
+; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v10|, |v3|
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v16
; GFX9-NEXT: v_xor_b32_sdwa v12, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
-; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v2
-; GFX9-NEXT: v_mul_f32_e32 v22, v16, v22
-; GFX9-NEXT: v_mul_f32_e32 v23, v19, v23
-; GFX9-NEXT: v_trunc_f32_e32 v21, v21
-; GFX9-NEXT: v_mad_f32 v24, -v20, v3, v10
+; GFX9-NEXT: v_xor_b32_sdwa v2, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
; GFX9-NEXT: v_xor_b32_sdwa v15, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2
+; GFX9-NEXT: v_mul_f32_e32 v3, v19, v3
+; GFX9-NEXT: v_trunc_f32_e32 v3, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v12, 30, v12
-; GFX9-NEXT: v_or_b32_e32 v2, 1, v2
-; GFX9-NEXT: v_trunc_f32_e32 v22, v22
-; GFX9-NEXT: v_trunc_f32_e32 v23, v23
-; GFX9-NEXT: v_mad_f32 v13, -v21, v14, v13
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v24|, |v3|
-; GFX9-NEXT: v_xor_b32_sdwa v18, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3
+; GFX9-NEXT: v_xor_b32_sdwa v10, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3
+; GFX9-NEXT: v_cvt_i32_f32_e32 v13, v13
+; GFX9-NEXT: v_cvt_i32_f32_e32 v18, v18
+; GFX9-NEXT: v_cvt_i32_f32_e32 v14, v14
+; GFX9-NEXT: v_mad_f32 v19, -v3, v16, v19
+; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v15, 30, v15
; GFX9-NEXT: v_or_b32_e32 v12, 1, v12
-; GFX9-NEXT: v_cvt_i32_f32_e32 v20, v20
-; GFX9-NEXT: v_cvt_i32_f32_e32 v21, v21
-; GFX9-NEXT: v_mad_f32 v25, -v22, v10, v16
-; GFX9-NEXT: v_cvt_i32_f32_e32 v22, v22
-; GFX9-NEXT: v_mad_f32 v19, -v23, v16, v19
-; GFX9-NEXT: v_cvt_i32_f32_e32 v23, v23
-; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, |v14|
-; GFX9-NEXT: v_ashrrev_i32_e32 v18, 30, v18
+; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v10, 30, v10
; GFX9-NEXT: v_or_b32_e32 v15, 1, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v12, vcc
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v25|, |v10|
-; GFX9-NEXT: v_or_b32_e32 v18, 1, v18
-; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v15, vcc
+; GFX9-NEXT: v_or_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_or_b32_e32 v10, 1, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v16|
-; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v18, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v15, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v4
-; GFX9-NEXT: v_add_u32_e32 v2, v20, v2
-; GFX9-NEXT: v_add_u32_e32 v3, v21, v3
-; GFX9-NEXT: v_add_u32_e32 v10, v22, v10
-; GFX9-NEXT: v_add_u32_e32 v12, v23, v12
-; GFX9-NEXT: v_perm_b32 v1, v4, v9, s4
+; GFX9-NEXT: v_add_u32_e32 v2, v13, v2
+; GFX9-NEXT: v_add_u32_e32 v12, v18, v12
+; GFX9-NEXT: v_add_u32_e32 v13, v14, v15
+; GFX9-NEXT: v_add_u32_e32 v3, v3, v10
; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4
-; GFX9-NEXT: v_mul_lo_u32 v3, v3, v11
-; GFX9-NEXT: v_mul_lo_u32 v4, v10, v0
-; GFX9-NEXT: v_mul_lo_u32 v10, v12, v17
+; GFX9-NEXT: v_mul_lo_u32 v4, v12, v11
+; GFX9-NEXT: v_mul_lo_u32 v10, v13, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, v3, v17
; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2
-; GFX9-NEXT: v_sub_u32_sdwa v2, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT: v_sub_u32_e32 v3, v17, v4
-; GFX9-NEXT: v_sub_u32_sdwa v4, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_sub_u32_sdwa v2, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT: v_sub_u32_e32 v4, v17, v10
+; GFX9-NEXT: v_sub_u32_sdwa v3, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: global_store_dword v[5:6], v0, off
; GFX9-NEXT: global_store_dword v[7:8], v1, off
@@ -2503,39 +2503,39 @@ define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15
; GFX9-NEXT: v_trunc_f32_e32 v16, v16
; GFX9-NEXT: v_mul_f32_e32 v17, v3, v17
-; GFX9-NEXT: v_mad_f32 v20, -v16, v3, v3
+; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, v2
+; GFX9-NEXT: v_mad_f32 v2, -v16, v3, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v13, v9
; GFX9-NEXT: v_trunc_f32_e32 v17, v17
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, v2
; GFX9-NEXT: v_mul_f32_e32 v18, v13, v18
-; GFX9-NEXT: v_mad_f32 v21, -v17, v11, v3
+; GFX9-NEXT: v_mad_f32 v19, -v17, v11, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v17
-; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v15, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc
; GFX9-NEXT: v_trunc_f32_e32 v18, v18
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v20|, v3
+; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3
; GFX9-NEXT: v_mad_f32 v13, -v18, v14, v13
; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v18
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v16, vcc
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v21|, v11
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v17, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v16, vcc
+; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, v11
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v17, vcc
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, v14
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v18, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v18, vcc
; GFX9-NEXT: v_perm_b32 v1, v4, v9, s4
-; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4
-; GFX9-NEXT: v_mul_lo_u32 v3, v3, v10
-; GFX9-NEXT: v_mul_lo_u32 v0, v11, v0
-; GFX9-NEXT: v_mul_lo_u32 v4, v13, v12
-; GFX9-NEXT: v_sub_u32_e32 v2, v10, v2
-; GFX9-NEXT: v_sub_u32_sdwa v3, v10, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_mul_lo_u32 v4, v15, v4
+; GFX9-NEXT: v_mul_lo_u32 v2, v2, v10
+; GFX9-NEXT: v_mul_lo_u32 v0, v3, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, v11, v12
+; GFX9-NEXT: v_sub_u32_e32 v4, v10, v4
+; GFX9-NEXT: v_sub_u32_sdwa v2, v10, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_sub_u32_e32 v0, v10, v0
-; GFX9-NEXT: v_sub_u32_sdwa v4, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_sub_u32_sdwa v3, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: global_store_dword v[5:6], v0, off
; GFX9-NEXT: global_store_dword v[7:8], v1, off
diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir
index f496a4b06bb237..81925de8910f80 100644
--- a/llvm/test/CodeGen/AMDGPU/pr51516.mir
+++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir
@@ -5,8 +5,8 @@
# is killed by that store.
# GCN-LABEL: name: global_sextload_v32i32_to_v32i64
-# GCN: renamable $vgpr33_vgpr34_vgpr35_vgpr36 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
-# GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr47, killed renamable $vgpr29_vgpr30_vgpr31_vgpr32, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr46
+# GCN: renamable $vgpr34_vgpr35_vgpr36_vgpr37 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+# GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr47, killed renamable $vgpr26_vgpr27_vgpr28_vgpr29, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr46
# GCN-GCNTRACKER-LABEL: name: global_sextload_v32i32_to_v32i64
# GCN-GCNTRACKER-NOT: SI_SPILL
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index a2a0107a6f7d81..a1197aeace86f0 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -361,96 +361,96 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 17, v0
-; GFX8-NEXT: v_and_b32_e32 v6, 0xfe000000, v1
+; GFX8-NEXT: v_and_b32_e32 v12, 0xfe000000, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 3
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: v_or_b32_e32 v0, v6, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v12, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s35
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_movk_i32 s0, 0x5000
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-NEXT: v_mov_b32_e32 v10, 0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, 0
+; GFX8-NEXT: v_mov_b32_e32 v11, 0
; GFX8-NEXT: s_movk_i32 s0, 0x7f
; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB1_2 Depth 2
-; GFX8-NEXT: v_mov_b32_e32 v5, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_mov_b32 s1, 0
; GFX8-NEXT: .LBB1_2: ; %for.body
; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xffffb000, v4
-; GFX8-NEXT: v_addc_u32_e32 v8, vcc, -1, v5, vcc
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffb800, v4
-; GFX8-NEXT: v_addc_u32_e32 v10, vcc, -1, v5, vcc
-; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xffffc000, v4
-; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8]
-; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10]
-; GFX8-NEXT: v_addc_u32_e32 v12, vcc, -1, v5, vcc
-; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0xffffc800, v4
-; GFX8-NEXT: v_addc_u32_e32 v14, vcc, -1, v5, vcc
-; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0xffffd000, v4
-; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[11:12]
-; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14]
-; GFX8-NEXT: v_addc_u32_e32 v16, vcc, -1, v5, vcc
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xffffd800, v4
-; GFX8-NEXT: v_addc_u32_e32 v18, vcc, -1, v5, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16]
-; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18]
-; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0xffffe000, v4
-; GFX8-NEXT: v_addc_u32_e32 v20, vcc, -1, v5, vcc
-; GFX8-NEXT: v_add_u32_e32 v21, vcc, 0xffffe800, v4
-; GFX8-NEXT: flat_load_dwordx2 v[19:20], v[19:20]
-; GFX8-NEXT: v_addc_u32_e32 v22, vcc, -1, v5, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[21:22], v[21:22]
-; GFX8-NEXT: v_add_u32_e32 v23, vcc, 0xfffff000, v4
-; GFX8-NEXT: v_addc_u32_e32 v24, vcc, -1, v5, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[23:24], v[23:24]
-; GFX8-NEXT: v_add_u32_e32 v25, vcc, 0xfffff800, v4
-; GFX8-NEXT: v_addc_u32_e32 v26, vcc, -1, v5, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[25:26], v[25:26]
-; GFX8-NEXT: flat_load_dwordx2 v[27:28], v[4:5]
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x10000, v4
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffb000, v2
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[4:5]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffb800, v2
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[6:7]
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffc000, v2
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[4:5]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffc800, v2
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffd000, v2
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0xffffd800, v2
+; GFX8-NEXT: v_addc_u32_e32 v20, vcc, -1, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v21, vcc, 0xffffe000, v2
+; GFX8-NEXT: v_addc_u32_e32 v22, vcc, -1, v3, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[4:5]
+; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[19:20]
; GFX8-NEXT: s_addk_i32 s1, 0x2000
; GFX8-NEXT: s_cmp_gt_u32 s1, 0x3fffff
-; GFX8-NEXT: s_waitcnt vmcnt(10)
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc
-; GFX8-NEXT: s_waitcnt vmcnt(9)
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v10, v3, vcc
-; GFX8-NEXT: s_waitcnt vmcnt(8)
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v11, v2
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v12, v3, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(5)
+; GFX8-NEXT: v_add_u32_e32 v23, vcc, v13, v10
+; GFX8-NEXT: v_addc_u32_e32 v24, vcc, v14, v11, vcc
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffe800, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, -1, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0xfffff000, v2
+; GFX8-NEXT: flat_load_dwordx2 v[19:20], v[21:22]
+; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11]
+; GFX8-NEXT: v_addc_u32_e32 v14, vcc, -1, v3, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(6)
+; GFX8-NEXT: v_add_u32_e32 v21, vcc, v15, v23
+; GFX8-NEXT: v_addc_u32_e32 v22, vcc, v16, v24, vcc
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0xfffff800, v2
+; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14]
+; GFX8-NEXT: v_addc_u32_e32 v16, vcc, -1, v3, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16]
+; GFX8-NEXT: s_waitcnt vmcnt(7)
+; GFX8-NEXT: v_add_u32_e32 v21, vcc, v17, v21
+; GFX8-NEXT: v_addc_u32_e32 v22, vcc, v18, v22, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[2:3]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x10000, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(7)
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v13, v2
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v21
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v22, vcc
; GFX8-NEXT: s_waitcnt vmcnt(6)
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v15, v2
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v16, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc
; GFX8-NEXT: s_waitcnt vmcnt(5)
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v17, v2
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v18, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
; GFX8-NEXT: s_waitcnt vmcnt(4)
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v19, v2
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v20, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v19, v4
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v20, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(3)
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v21, v2
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v22, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v10, v4
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v23, v2
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v24, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v13, v4
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v14, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v25, v2
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v26, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v15, v4
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v16, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v27, v2
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v28, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v17, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v18, v5, vcc
; GFX8-NEXT: s_cbranch_scc0 .LBB1_2
; GFX8-NEXT: ; %bb.3: ; %while.cond.loopexit
; GFX8-NEXT: ; in Loop: Header=BB1_1 Depth=1
@@ -462,9 +462,9 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX8-NEXT: s_branch .LBB1_1
; GFX8-NEXT: .LBB1_5: ; %while.end
; GFX8-NEXT: v_mov_b32_e32 v1, s35
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v6
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v12
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[10:11]
; GFX8-NEXT: s_endpgm
;
; GFX900-LABEL: clmem_read:
@@ -496,91 +496,92 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX900-NEXT: s_movk_i32 s0, 0x5000
; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX900-NEXT: s_movk_i32 s2, 0x7f
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: s_movk_i32 s0, 0xd000
-; GFX900-NEXT: s_movk_i32 s1, 0xe000
-; GFX900-NEXT: s_movk_i32 s3, 0xf000
+; GFX900-NEXT: s_movk_i32 s4, 0x7f
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: s_movk_i32 s2, 0xd000
+; GFX900-NEXT: s_movk_i32 s3, 0xe000
+; GFX900-NEXT: s_movk_i32 s5, 0xf000
; GFX900-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX900-NEXT: ; =>This Loop Header: Depth=1
; GFX900-NEXT: ; Child Loop BB1_2 Depth 2
-; GFX900-NEXT: v_mov_b32_e32 v5, v1
-; GFX900-NEXT: v_mov_b32_e32 v4, v0
-; GFX900-NEXT: s_mov_b32 s4, 0
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: s_mov_b32 s6, 0
; GFX900-NEXT: .LBB1_2: ; %for.body
; GFX900-NEXT: ; Parent Loop BB1_1 Depth=1
; GFX900-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, 0xffffb000, v4
-; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, -1, v5, vcc
-; GFX900-NEXT: global_load_dwordx2 v[9:10], v[4:5], off offset:-4096
-; GFX900-NEXT: global_load_dwordx2 v[11:12], v[4:5], off offset:-2048
-; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, 0xffffc000, v4
+; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, 0xffffb000, v2
+; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, -1, v3, vcc
+; GFX900-NEXT: global_load_dwordx2 v[9:10], v[2:3], off offset:-4096
+; GFX900-NEXT: global_load_dwordx2 v[11:12], v[2:3], off offset:-2048
+; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, 0xffffc000, v2
; GFX900-NEXT: global_load_dwordx2 v[7:8], v[7:8], off
-; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v5, vcc
+; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v3, vcc
; GFX900-NEXT: global_load_dwordx2 v[17:18], v[13:14], off offset:-2048
-; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, s0, v4
-; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, -1, v5, vcc
+; GFX900-NEXT: global_load_dwordx2 v[19:20], v[13:14], off
+; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, s2, v2
+; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, -1, v3, vcc
+; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, s3, v2
; GFX900-NEXT: global_load_dwordx2 v[15:16], v[15:16], off offset:-2048
-; GFX900-NEXT: v_add_co_u32_e32 v19, vcc, s1, v4
-; GFX900-NEXT: global_load_dwordx2 v[13:14], v[13:14], off
-; GFX900-NEXT: v_addc_co_u32_e32 v20, vcc, -1, v5, vcc
-; GFX900-NEXT: global_load_dwordx2 v[23:24], v[19:20], off offset:-4096
-; GFX900-NEXT: global_load_dwordx2 v[25:26], v[19:20], off offset:-2048
-; GFX900-NEXT: global_load_dwordx2 v[27:28], v[19:20], off
-; GFX900-NEXT: v_add_co_u32_e32 v21, vcc, s3, v4
-; GFX900-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v5, vcc
-; GFX900-NEXT: global_load_dwordx2 v[19:20], v[21:22], off offset:-2048
-; GFX900-NEXT: global_load_dwordx2 v[29:30], v[4:5], off
-; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, 0x10000, v4
-; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX900-NEXT: s_addk_i32 s4, 0x2000
-; GFX900-NEXT: s_cmp_gt_u32 s4, 0x3fffff
-; GFX900-NEXT: s_waitcnt vmcnt(8)
-; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2
-; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
-; GFX900-NEXT: s_waitcnt vmcnt(7)
-; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v17, v2
-; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v18, v3, vcc
+; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v3, vcc
+; GFX900-NEXT: s_addk_i32 s6, 0x2000
+; GFX900-NEXT: s_cmp_gt_u32 s6, 0x3fffff
+; GFX900-NEXT: s_waitcnt vmcnt(3)
+; GFX900-NEXT: v_add_co_u32_e32 v21, vcc, v7, v4
+; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v5, vcc
+; GFX900-NEXT: global_load_dwordx2 v[7:8], v[13:14], off offset:-4096
+; GFX900-NEXT: s_waitcnt vmcnt(3)
+; GFX900-NEXT: v_add_co_u32_e64 v23, s[0:1], v17, v21
+; GFX900-NEXT: v_addc_co_u32_e64 v24, s[0:1], v18, v5, s[0:1]
+; GFX900-NEXT: global_load_dwordx2 v[17:18], v[13:14], off offset:-2048
+; GFX900-NEXT: global_load_dwordx2 v[21:22], v[13:14], off
+; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s5, v2
+; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc
+; GFX900-NEXT: global_load_dwordx2 v[4:5], v[4:5], off offset:-2048
+; GFX900-NEXT: s_waitcnt vmcnt(5)
+; GFX900-NEXT: v_add_co_u32_e32 v19, vcc, v19, v23
+; GFX900-NEXT: global_load_dwordx2 v[13:14], v[2:3], off
+; GFX900-NEXT: v_addc_co_u32_e32 v20, vcc, v20, v24, vcc
+; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x10000, v2
+; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX900-NEXT: s_waitcnt vmcnt(5)
-; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2
-; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v14, v3, vcc
-; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v15, v2
-; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v16, v3, vcc
+; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, v15, v19
+; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, v16, v20, vcc
; GFX900-NEXT: s_waitcnt vmcnt(4)
-; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v23, v2
-; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v24, v3, vcc
+; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v7, v15
+; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v16, vcc
; GFX900-NEXT: s_waitcnt vmcnt(3)
-; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v25, v2
-; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v26, v3, vcc
+; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v17, v7
+; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v18, v8, vcc
; GFX900-NEXT: s_waitcnt vmcnt(2)
-; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v27, v2
-; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v28, v3, vcc
+; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v21, v7
+; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v22, v8, vcc
; GFX900-NEXT: s_waitcnt vmcnt(1)
-; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v19, v2
-; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v20, v3, vcc
-; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2
-; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v10, v3, vcc
-; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v11, v2
-; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v12, v3, vcc
+; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7
+; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v8, vcc
+; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v9, v4
+; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v10, v5, vcc
+; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v11, v4
+; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v12, v5, vcc
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v29, v2
-; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v30, v3, vcc
+; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v13, v4
+; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v14, v5, vcc
; GFX900-NEXT: s_cbranch_scc0 .LBB1_2
; GFX900-NEXT: ; %bb.3: ; %while.cond.loopexit
; GFX900-NEXT: ; in Loop: Header=BB1_1 Depth=1
-; GFX900-NEXT: s_add_i32 s4, s2, -1
-; GFX900-NEXT: s_cmp_eq_u32 s2, 0
+; GFX900-NEXT: s_add_i32 s0, s4, -1
+; GFX900-NEXT: s_cmp_eq_u32 s4, 0
; GFX900-NEXT: s_cbranch_scc1 .LBB1_5
; GFX900-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
-; GFX900-NEXT: s_mov_b32 s2, s4
+; GFX900-NEXT: s_mov_b32 s4, s0
; GFX900-NEXT: s_branch .LBB1_1
; GFX900-NEXT: .LBB1_5: ; %while.end
; GFX900-NEXT: v_mov_b32_e32 v1, s35
; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s34, v6
; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX900-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX900-NEXT: global_store_dwordx2 v[0:1], v[4:5], off
; GFX900-NEXT: s_endpgm
;
; GFX10-LABEL: clmem_read:
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index 6583d5e8aa5a07..704947523f677c 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -70,22 +70,22 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_subbrev_co_u32_e32 v9, vcc, 0, v9, vcc
; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7]
-; GFX9-NEXT: v_or_b32_e32 v13, v7, v9
+; GFX9-NEXT: v_or_b32_e32 v12, v7, v9
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc
; GFX9-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX9-NEXT: v_xor_b32_e32 v11, 0x7f, v6
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
-; GFX9-NEXT: v_xor_b32_e32 v10, 0x7f, v6
-; GFX9-NEXT: v_or_b32_e32 v12, v10, v8
+; GFX9-NEXT: v_or_b32_e32 v11, v11, v8
; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13]
+; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[11:12]
; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, 0, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v10, v3, 0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v10, v1, 0, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v11, v1, 0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v13, v0, 0, s[4:5]
; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc
; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
@@ -107,47 +107,47 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_or_b32_e32 v8, v10, v12
; GFX9-NEXT: v_or_b32_e32 v9, v9, v11
; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v13
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v13
-; GFX9-NEXT: v_lshlrev_b64 v[12:13], v13, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[10:11], v13, v[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v13
; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-NEXT: v_mov_b32_e32 v12, 0
; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v13, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v11, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v9, 0
-; GFX9-NEXT: v_mov_b32_e32 v11, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v13, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5]
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB0_5
; GFX9-NEXT: ; %bb.2: ; %udiv-preheader
-; GFX9-NEXT: v_sub_u32_e32 v10, 64, v24
+; GFX9-NEXT: v_sub_u32_e32 v12, 64, v24
; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3]
+; GFX9-NEXT: v_lshlrev_b64 v[12:13], v12, v[2:3]
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24
-; GFX9-NEXT: v_or_b32_e32 v10, v8, v10
+; GFX9-NEXT: v_or_b32_e32 v12, v8, v12
; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v24
-; GFX9-NEXT: v_or_b32_e32 v11, v9, v11
+; GFX9-NEXT: v_or_b32_e32 v13, v9, v13
; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v15, v9, v1, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v8, v10, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v8, v12, vcc
; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e64 v14, v10, v0, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v14, v12, v0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc
; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, -1, v23
; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, -1, v22, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v30, vcc, -1, v4, vcc
; GFX9-NEXT: v_mov_b32_e32 v18, 0
-; GFX9-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-NEXT: v_mov_b32_e32 v12, 0
; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, -1, v5, vcc
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: v_mov_b32_e32 v19, 0
-; GFX9-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-NEXT: v_mov_b32_e32 v13, 0
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: .LBB0_3: ; %udiv-do-while
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -155,20 +155,20 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[14:15]
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 31, v7
; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7]
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 31, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 31, v11
; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[16:17]
; GFX9-NEXT: v_or_b32_e32 v14, v14, v33
-; GFX9-NEXT: v_or3_b32 v6, v6, v8, v10
+; GFX9-NEXT: v_or3_b32 v6, v6, v8, v12
; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v28, v14
; GFX9-NEXT: v_or_b32_e32 v16, v16, v32
; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v29, v15, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v30, v16, vcc
-; GFX9-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13]
+; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11]
; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v31, v17, vcc
; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8
-; GFX9-NEXT: v_or_b32_e32 v12, v18, v12
+; GFX9-NEXT: v_or_b32_e32 v10, v18, v10
; GFX9-NEXT: v_and_b32_e32 v18, v8, v23
-; GFX9-NEXT: v_or_b32_e32 v13, v19, v13
+; GFX9-NEXT: v_or_b32_e32 v11, v19, v11
; GFX9-NEXT: v_and_b32_e32 v19, v8, v22
; GFX9-NEXT: v_sub_co_u32_e32 v14, vcc, v14, v18
; GFX9-NEXT: v_and_b32_e32 v32, v8, v4
@@ -185,7 +185,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
; GFX9-NEXT: v_and_b32_e32 v8, 1, v8
; GFX9-NEXT: v_mov_b32_e32 v19, v9
-; GFX9-NEXT: v_or3_b32 v7, v7, 0, v11
+; GFX9-NEXT: v_or3_b32 v7, v7, 0, v13
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v18, v8
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
@@ -194,12 +194,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: .LBB0_5: ; %Flow2
; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[12:13]
+; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[10:11]
; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7]
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v13
-; GFX9-NEXT: v_or3_b32 v11, v7, 0, v11
-; GFX9-NEXT: v_or3_b32 v12, v6, v12, v10
-; GFX9-NEXT: v_or_b32_e32 v10, v9, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 31, v11
+; GFX9-NEXT: v_or3_b32 v10, v7, 0, v13
+; GFX9-NEXT: v_or3_b32 v12, v6, v11, v12
+; GFX9-NEXT: v_or_b32_e32 v11, v9, v15
; GFX9-NEXT: v_or_b32_e32 v13, v8, v14
; GFX9-NEXT: .LBB0_6: ; %Flow3
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
@@ -209,19 +209,19 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v4, 0
; GFX9-NEXT: v_mov_b32_e32 v14, v6
; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v22, v13, v[14:15]
-; GFX9-NEXT: v_mul_lo_u32 v9, v10, v4
-; GFX9-NEXT: v_mul_lo_u32 v11, v11, v23
+; GFX9-NEXT: v_mul_lo_u32 v9, v11, v4
+; GFX9-NEXT: v_mul_lo_u32 v10, v10, v23
; GFX9-NEXT: v_mov_b32_e32 v4, v14
; GFX9-NEXT: v_mov_b32_e32 v14, v15
-; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v23, v10, v[13:14]
+; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v23, v11, v[13:14]
; GFX9-NEXT: v_add3_u32 v8, v8, v16, v9
; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v23, v[7:8]
; GFX9-NEXT: v_mov_b32_e32 v8, v14
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v8
; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc
; GFX9-NEXT: v_mul_lo_u32 v12, v12, v22
-; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v22, v10, v[8:9]
-; GFX9-NEXT: v_add3_u32 v4, v11, v7, v12
+; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v22, v11, v[8:9]
+; GFX9-NEXT: v_add3_u32 v4, v10, v7, v12
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v4, vcc
; GFX9-NEXT: v_mov_b32_e32 v7, v13
@@ -1628,38 +1628,38 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v13, 0
; GFX9-NEXT: .LBB1_3: ; %udiv-do-while
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_lshlrev_b64 v[30:31], 1, v[10:11]
; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v11
-; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11]
-; GFX9-NEXT: v_lshlrev_b64 v[18:19], 1, v[18:19]
-; GFX9-NEXT: v_or_b32_e32 v10, v20, v10
+; GFX9-NEXT: v_or_b32_e32 v10, v20, v30
; GFX9-NEXT: v_lshrrev_b32_e32 v20, 31, v17
; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[16:17]
+; GFX9-NEXT: v_or_b32_e32 v11, v21, v31
+; GFX9-NEXT: v_lshlrev_b64 v[18:19], 1, v[18:19]
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 31, v9
+; GFX9-NEXT: v_or_b32_e32 v16, v16, v21
; GFX9-NEXT: v_or_b32_e32 v18, v18, v20
-; GFX9-NEXT: v_lshrrev_b32_e32 v20, 31, v9
-; GFX9-NEXT: v_or_b32_e32 v16, v16, v20
; GFX9-NEXT: v_sub_co_u32_e32 v20, vcc, v26, v16
; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v27, v17, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v28, v18, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v29, v19, vcc
; GFX9-NEXT: v_ashrrev_i32_e32 v30, 31, v20
; GFX9-NEXT: v_and_b32_e32 v20, v30, v4
+; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9]
; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v16, v20
; GFX9-NEXT: v_and_b32_e32 v20, v30, v5
; GFX9-NEXT: v_subb_co_u32_e32 v17, vcc, v17, v20, vcc
-; GFX9-NEXT: v_and_b32_e32 v20, v30, v6
-; GFX9-NEXT: v_subb_co_u32_e32 v18, vcc, v18, v20, vcc
+; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14
+; GFX9-NEXT: v_and_b32_e32 v12, v30, v6
; GFX9-NEXT: v_and_b32_e32 v20, v30, v7
+; GFX9-NEXT: v_subb_co_u32_e32 v18, vcc, v18, v12, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v19, vcc, v19, v20, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, -1, v22
; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v24, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc
-; GFX9-NEXT: v_or_b32_e32 v11, v21, v11
-; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9]
; GFX9-NEXT: v_or_b32_e32 v20, v22, v24
; GFX9-NEXT: v_or_b32_e32 v21, v23, v25
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21]
-; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14
; GFX9-NEXT: v_and_b32_e32 v12, 1, v30
; GFX9-NEXT: v_mov_b32_e32 v21, v13
; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15
diff --git a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll
index a4335095115842..dc5e442c2b2622 100644
--- a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll
+++ b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll
@@ -8,7 +8,7 @@
; GCN-NOT: v_writelane_b32
; GCN: s_cbranch_{{[^ ]+}} [[LOOP]]
; GCN: .sgpr_spill_count: 0
-define amdgpu_kernel void @test_remat_sgpr(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) {
+define amdgpu_kernel void @test_remat_sgpr(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 {
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb3
@@ -43,3 +43,5 @@ bb3: ; preds = %bb3, %bb
declare double @llvm.fma.f64(double, double, double)
declare i32 @llvm.amdgcn.workitem.id.x()
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1024,1024" }
diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
index 8bbae59f468f1d..cbd1714a5e375e 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
@@ -127,7 +127,7 @@ define void @test_func() !dbg !6 {
; STDERR-NEXT: remark: foo.cl:8:0: AGPRs: 0
; STDERR-NEXT: remark: foo.cl:8:0: ScratchSize [bytes/lane]: 0
; STDERR-NEXT: remark: foo.cl:8:0: Dynamic Stack: False
-; STDERR-NEXT: remark: foo.cl:8:0: Occupancy [waves/SIMD]: 8
+; STDERR-NEXT: remark: foo.cl:8:0: Occupancy [waves/SIMD]: 10
; STDERR-NEXT: remark: foo.cl:8:0: SGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:8:0: VGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:8:0: LDS Size [bytes/block]: 0
@@ -146,7 +146,7 @@ define void @empty_func() !dbg !8 {
; STDERR-NEXT: remark: foo.cl:64:0: AGPRs: test_indirect_call.num_agpr
; STDERR-NEXT: remark: foo.cl:64:0: ScratchSize [bytes/lane]: 0
; STDERR-NEXT: remark: foo.cl:64:0: Dynamic Stack: True
-; STDERR-NEXT: remark: foo.cl:64:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_call.numbered_sgpr+(extrasgprs(test_indirect_call.uses_vcc, test_indirect_call.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_call.num_agpr, test_indirect_call.num_vgpr), 1, 0))
+; STDERR-NEXT: remark: foo.cl:64:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 10, max(test_indirect_call.numbered_sgpr+(extrasgprs(test_indirect_call.uses_vcc, test_indirect_call.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_call.num_agpr, test_indirect_call.num_vgpr), 1, 0))
; STDERR-NEXT: remark: foo.cl:64:0: SGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:64:0: VGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:64:0: LDS Size [bytes/block]: 0
@@ -164,7 +164,7 @@ define amdgpu_kernel void @test_indirect_call() !dbg !9 {
; STDERR-NEXT: remark: foo.cl:74:0: AGPRs: test_indirect_w_static_stack.num_agpr
; STDERR-NEXT: remark: foo.cl:74:0: ScratchSize [bytes/lane]: 144
; STDERR-NEXT: remark: foo.cl:74:0: Dynamic Stack: True
-; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_w_static_stack.numbered_sgpr+(extrasgprs(test_indirect_w_static_stack.uses_vcc, test_indirect_w_static_stack.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_w_static_stack.num_agpr, test_indirect_w_static_stack.num_vgpr), 1, 0))
+; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 10, max(test_indirect_w_static_stack.numbered_sgpr+(extrasgprs(test_indirect_w_static_stack.uses_vcc, test_indirect_w_static_stack.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_w_static_stack.num_agpr, test_indirect_w_static_stack.num_vgpr), 1, 0))
; STDERR-NEXT: remark: foo.cl:74:0: SGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:74:0: VGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:74:0: LDS Size [bytes/block]: 0
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
index 8f4a4b5afcdc1e..554e3640221b94 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
@@ -1675,7 +1675,7 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260
; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
-; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0x3ff00000
+; SI-GISEL-NEXT: v_mov_b32_e32 v18, 0x3ff00000
; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
@@ -1716,23 +1716,22 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7]
; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9]
; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5]
-; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v20
-; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13]
-; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
-; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0
+; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v18
+; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[10:11], v[14:15], v[12:13]
+; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[8:9], v[6:7], 1.0
; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7]
+; SI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 1.0
; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7]
-; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
-; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19]
-; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v19, v20
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
+; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[14:15]
+; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[16:17], v[6:7]
+; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18
+; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[16:17]
; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9
; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
-; SI-GISEL-NEXT: s_nop 0
; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11]
; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1978,7 +1977,7 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260
; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
-; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0xbff00000
+; SI-GISEL-NEXT: v_mov_b32_e32 v18, 0xbff00000
; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
@@ -2019,23 +2018,22 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7]
; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9]
; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5]
-; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v20
-; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13]
-; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
-; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], -1.0, v[2:3], -1.0
-; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0
+; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v18
+; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[10:11], v[14:15], v[12:13]
+; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[8:9], v[6:7], 1.0
; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7]
+; SI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[6:7], -1.0, v[2:3], -1.0
+; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 1.0
; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7]
-; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
-; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19]
-; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v19, v20
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
+; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[14:15]
+; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[16:17], v[6:7]
+; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18
+; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[16:17]
; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9
; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
-; SI-GISEL-NEXT: s_nop 0
; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11]
; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2245,8 +2243,8 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5
; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
-; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80
-; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260
+; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc
; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
@@ -2254,60 +2252,60 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8
; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3]
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15
-; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
+; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13
+; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[10:11], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[10:11]
+; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260
+; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5
; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9]
-; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9]
; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15
-; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[12:13], v[10:11]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5]
-; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[8:9], v[8:9], v[2:3]
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[10:11]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v12, s[4:5]
+; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0
; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
-; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], s[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], s[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[4:5], 1.0
; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7]
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9]
-; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13]
-; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[14:15], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[12:13], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[6:7], v[14:15], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[10:11], v[8:9], v[12:13]
+; SI-GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v10
-; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0
-; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], s[4:5], v[2:3], s[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[6:7], v[14:15], 1.0
+; SI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], s[4:5], v[2:3], s[4:5]
; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11
-; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[14:15], v[12:13], v[14:15]
; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
-; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19]
-; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v19
-; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9
+; SI-GISEL-NEXT: v_mul_f64 v[12:13], v[16:17], v[10:11]
+; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[18:19], v[4:5], v[8:9]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[12:13], v[16:17]
+; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v17
+; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v7
; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
; SI-GISEL-NEXT: s_nop 0
-; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[10:11], v[12:13]
; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], s[4:5]
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2520,8 +2518,8 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5
; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
-; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80
-; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260
+; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc
; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
@@ -2529,61 +2527,61 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8
; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3]
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15
-; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
+; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13
+; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[10:11], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[10:11]
+; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260
+; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5
; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9]
-; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9]
; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15
-; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[12:13], v[10:11]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5]
-; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[8:9], v[8:9], v[2:3]
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[10:11]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v12, s[4:5]
+; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0
; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
-; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0
-; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7]
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9]
-; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13]
-; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[4:5], 1.0
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[14:15], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[12:13], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[6:7], v[14:15], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[10:11], v[8:9], v[12:13]
+; SI-GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v10
-; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0
-; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[6:7], v[14:15], 1.0
+; SI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11
-; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[14:15], v[12:13], v[14:15]
; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19]
-; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0x3ff00000
-; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
-; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v19, v8
-; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9
+; SI-GISEL-NEXT: v_mul_f64 v[12:13], v[16:17], v[10:11]
+; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[18:19], v[4:5], v[8:9]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[12:13], v[16:17]
+; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x3ff00000
+; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v17, v6
+; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v7
; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
; SI-GISEL-NEXT: s_nop 0
-; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[10:11], v[12:13]
; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
index 6d79837feb1289..6796391aba6751 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
@@ -42,7 +42,7 @@ body: |
; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def dead %11
; CHECK-NEXT: GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_1]]
+ ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 3)
; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def %15, 851978 /* regdef:VGPR_16 */, def %16
; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec
@@ -50,8 +50,8 @@ body: |
; CHECK-NEXT: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec
; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def %21, 851978 /* regdef:VGPR_16 */, def %22
; CHECK-NEXT: [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_2]], 0, 0, implicit $exec
- ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def dead [[V_MOV_B32_e32_3]], 851978 /* regdef:VGPR_16 */, def dead [[V_MOV_B32_e32_4]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_3]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_4]](tied-def 5), 851977 /* reguse:VGPR_16 */, %15, 851977 /* reguse:VGPR_16 */, %16, 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_1]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_3]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_2]]
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_1]]
; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store (s32), addrspace 3)
; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store (s32), addrspace 3)
; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %30:vgpr_32, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3)
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
index 268322bd074bfd..648f4fc64f9d03 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
@@ -44,9 +44,9 @@ entry:
; CHECK-LABEL: {{^}}global_extload_v16f16_to_v16f64:
; TONGA: NumSgprs: 96
; TONGA-GCNTRACKERS: NumSgprs: 96
-; TONGA: NumVgprs: 33
-; TONGA-GCNTRACKERS: NumVgprs: 25
-; TONGA: Occupancy: 7
+; TONGA: NumVgprs: 21
+; TONGA-GCNTRACKERS: NumVgprs: 23
+; TONGA: Occupancy: 8
; TONGA-GCNTRACKERS: Occupancy: 8
@@ -59,11 +59,11 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
; CHECK-LABEL: {{^}}constant_zextload_v64i16_to_v64i32:
; GENERIC: NumSgprs: 71
-; GENERIC-GCNTRACKERS: NumSgprs: 54
-; GENERIC: NumVgprs: 16
-; GENERIC-GCNTRACKERS: NumVgprs: 16
+; GENERIC-GCNTRACKERS: NumSgprs: 45
+; GENERIC: NumVgprs: 20
+; GENERIC-GCNTRACKERS: NumVgprs: 20
; GENERIC: Occupancy: 7
-; GENERIC-GCNTRACKERS: Occupancy: 8
+; GENERIC-GCNTRACKERS: Occupancy: 10
define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
%load = load <64 x i16>, ptr addrspace(4) %in
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir
index 9429d1565962e4..e67036f0bbbea2 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir
+++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir
@@ -16,20 +16,20 @@ body: |
; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub3:vreg_128 = COPY $vgpr9
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub2:vreg_128 = COPY $vgpr8
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_128 = COPY $vgpr7
- ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_128 = COPY $vgpr6
- ; CHECK-NEXT: undef [[COPY4:%[0-9]+]].sub3:vreg_128 = COPY $vgpr5
- ; CHECK-NEXT: undef [[COPY5:%[0-9]+]].sub2:vreg_128 = COPY $vgpr4
- ; CHECK-NEXT: undef [[COPY6:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1
- ; CHECK-NEXT: [[COPY6:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_128 = COPY $vgpr6
+ ; CHECK-NEXT: undef [[COPY5:%[0-9]+]].sub3:vreg_128 = COPY $vgpr5
+ ; CHECK-NEXT: undef [[COPY6:%[0-9]+]].sub2:vreg_128 = COPY $vgpr4
; CHECK-NEXT: undef [[COPY7:%[0-9]+]].sub1:vreg_128 = COPY $vgpr3
; CHECK-NEXT: undef [[COPY8:%[0-9]+]].sub0:vreg_128 = COPY $vgpr2
; CHECK-NEXT: undef [[V_READFIRSTLANE_B32_:%[0-9]+]].sub0:sgpr_128 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec
; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub1:sgpr_128 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub2:sgpr_128 = V_READFIRSTLANE_B32 [[COPY6]].sub2, implicit $exec
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub3:sgpr_128 = V_READFIRSTLANE_B32 [[COPY5]].sub3, implicit $exec
; CHECK-NEXT: S_BARRIER
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub2:sgpr_128 = V_READFIRSTLANE_B32 [[COPY5]].sub2, implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub3:sgpr_128 = V_READFIRSTLANE_B32 [[COPY4]].sub3, implicit $exec
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec
- ; CHECK-NEXT: undef [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub0:sgpr_128 = V_READFIRSTLANE_B32 [[COPY3]].sub0, implicit $exec
+ ; CHECK-NEXT: undef [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub0:sgpr_128 = V_READFIRSTLANE_B32 [[COPY4]].sub0, implicit $exec
; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub1:sgpr_128 = V_READFIRSTLANE_B32 [[COPY2]].sub1, implicit $exec
; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub2:sgpr_128 = V_READFIRSTLANE_B32 [[COPY1]].sub2, implicit $exec
; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub3:sgpr_128 = V_READFIRSTLANE_B32 [[COPY]].sub3, implicit $exec
@@ -37,7 +37,7 @@ body: |
; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[BUFFER_LOAD_DWORD_OFFSET]], [[BUFFER_LOAD_DWORD_OFFSET]], implicit $exec
; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[BUFFER_LOAD_DWORD_OFFSET1]], [[BUFFER_LOAD_DWORD_OFFSET1]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MUL_LO_U32_e64_]], [[V_MUL_LO_U32_e64_1]], implicit $exec
- ; CHECK-NEXT: GLOBAL_STORE_DWORD [[COPY6]], [[V_ADD_U32_e32_]], 0, 0, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD [[COPY3]], [[V_ADD_U32_e32_]], 0, 0, implicit $exec
; CHECK-NEXT: S_ENDPGM 0
undef %43.sub3:vreg_128 = COPY $vgpr9
undef %42.sub2:vreg_128 = COPY $vgpr8
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
index bd1258cb1cf980..1e5d6755fbc85f 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
@@ -42,4 +42,4 @@ bb2:
declare i32 @llvm.amdgcn.workitem.id.x() #0
attributes #0 = { nounwind readnone }
-attributes #1 = { "amdgpu-num-vgpr"="9" }
+attributes #1 = { "amdgpu-num-vgpr"="9" "amdgpu-flat-work-group-size"="1024,1024" }
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll
index 71f8d91874f04f..5a30d5d5e42ec2 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll
@@ -7,16 +7,16 @@
; Using -amgpu-schedule-relaxed-occupancy allows scheduler to produce better ILP by further relaxing occupancy target
; CHECK-LABEL: {{^}}load_fma_store:
-; OCC: NumVgprs: 32
-; OCC-GCNTRACKER: NumVgprs: 24
+; OCC: NumVgprs: 24
+; OCC-GCNTRACKER: NumVgprs: 26
; RELAX: NumVgprs: 64
; RELAX-GCNTRACKER: NumVgprs: 60
-; OCC: NumVGPRsForWavesPerEU: 32
-; OCC-GCNTRACKER: NumVGPRsForWavesPerEU: 24
+; OCC: NumVGPRsForWavesPerEU: 24
+; OCC-GCNTRACKER: NumVGPRsForWavesPerEU: 26
; RELAX: NumVGPRsForWavesPerEU: 64
; RELAX-GCNTRACKER: NumVGPRsForWavesPerEU: 60
-; OCC: Occupancy: 8
-; OCC-GCNTRACKER: Occupancy: 8
+; OCC: Occupancy: 10
+; OCC-GCNTRACKER: Occupancy: 9
; RELAX: Occupancy: 4
; RELAX-GCNTRACKER: Occupancy: 4
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
index 6225ff73e28d08..57c54c4de71027 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -792,255 +792,255 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: sdiv_v4i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT: s_mov_b32 s11, 0xf000
-; GCN-NEXT: s_mov_b32 s10, -1
-; GCN-NEXT: s_mov_b32 s6, s10
-; GCN-NEXT: s_mov_b32 s7, s11
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s4, s2
-; GCN-NEXT: s_mov_b32 s5, s3
-; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GCN-NEXT: s_mov_b32 s8, s0
-; GCN-NEXT: s_mov_b32 s9, s1
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v1
+; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v5
+; GCN-NEXT: v_xor_b32_e32 v11, v1, v5
+; GCN-NEXT: v_max_i32_e32 v5, v5, v12
+; GCN-NEXT: v_cvt_f32_u32_e32 v12, v5
; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v4
-; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v5
-; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v6
; GCN-NEXT: v_xor_b32_e32 v8, v0, v4
-; GCN-NEXT: v_xor_b32_e32 v11, v1, v5
-; GCN-NEXT: v_xor_b32_e32 v14, v2, v6
+; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12
; GCN-NEXT: v_max_i32_e32 v4, v4, v10
-; GCN-NEXT: v_max_i32_e32 v5, v5, v13
-; GCN-NEXT: v_max_i32_e32 v6, v6, v16
-; GCN-NEXT: v_max_i32_e32 v1, v1, v12
-; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v14
+; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v5
+; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v12
+; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10
; GCN-NEXT: v_cvt_f32_u32_e32 v12, v4
-; GCN-NEXT: v_cvt_f32_u32_e32 v14, v5
-; GCN-NEXT: v_cvt_f32_u32_e32 v16, v6
-; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v0
+; GCN-NEXT: v_max_i32_e32 v1, v1, v13
+; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v6
+; GCN-NEXT: v_mul_lo_u32 v16, v16, v10
; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12
-; GCN-NEXT: v_rcp_iflag_f32_e32 v14, v14
-; GCN-NEXT: v_rcp_iflag_f32_e32 v16, v16
-; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v2
+; GCN-NEXT: v_xor_b32_e32 v14, v2, v6
+; GCN-NEXT: v_max_i32_e32 v6, v6, v15
+; GCN-NEXT: v_mul_hi_u32 v16, v10, v16
; GCN-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12
-; GCN-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14
-; GCN-NEXT: v_mul_f32_e32 v16, 0x4f7ffffe, v16
; GCN-NEXT: v_cvt_u32_f32_e32 v12, v12
-; GCN-NEXT: v_cvt_u32_f32_e32 v14, v14
-; GCN-NEXT: v_cvt_u32_f32_e32 v16, v16
-; GCN-NEXT: v_sub_i32_e32 v17, vcc, 0, v7
+; GCN-NEXT: v_cvt_f32_u32_e32 v15, v6
+; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v16
+; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v4
+; GCN-NEXT: v_mul_lo_u32 v16, v16, v12
+; GCN-NEXT: v_mul_hi_u32 v10, v1, v10
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v0
+; GCN-NEXT: v_mul_hi_u32 v13, v12, v16
; GCN-NEXT: v_max_i32_e32 v0, v0, v9
-; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v11
-; GCN-NEXT: v_max_i32_e32 v2, v2, v15
-; GCN-NEXT: v_max_i32_e32 v11, v7, v17
-; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v4
-; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v5
-; GCN-NEXT: v_sub_i32_e32 v17, vcc, 0, v6
-; GCN-NEXT: v_mul_lo_u32 v13, v13, v12
-; GCN-NEXT: v_mul_lo_u32 v15, v15, v14
-; GCN-NEXT: v_mul_lo_u32 v17, v17, v16
-; GCN-NEXT: v_cvt_f32_u32_e32 v18, v11
-; GCN-NEXT: v_mul_hi_u32 v13, v12, v13
-; GCN-NEXT: v_mul_hi_u32 v15, v14, v15
-; GCN-NEXT: v_mul_hi_u32 v17, v16, v17
-; GCN-NEXT: v_rcp_iflag_f32_e32 v18, v18
+; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v15
+; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v8
; GCN-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; GCN-NEXT: v_add_i32_e32 v13, vcc, v14, v15
-; GCN-NEXT: v_add_i32_e32 v14, vcc, v16, v17
+; GCN-NEXT: v_mul_lo_u32 v13, v10, v5
; GCN-NEXT: v_mul_hi_u32 v12, v0, v12
-; GCN-NEXT: v_mul_hi_u32 v13, v1, v13
-; GCN-NEXT: v_mul_hi_u32 v14, v2, v14
-; GCN-NEXT: v_mul_f32_e32 v18, 0x4f7ffffe, v18
-; GCN-NEXT: v_mul_lo_u32 v15, v12, v4
-; GCN-NEXT: v_mul_lo_u32 v17, v13, v5
-; GCN-NEXT: v_mul_lo_u32 v21, v14, v6
-; GCN-NEXT: v_cvt_u32_f32_e32 v18, v18
-; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v15
-; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v17
-; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v21
-; GCN-NEXT: v_add_i32_e32 v16, vcc, 1, v12
-; GCN-NEXT: v_add_i32_e32 v20, vcc, 1, v13
-; GCN-NEXT: v_add_i32_e32 v15, vcc, 1, v14
-; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
-; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5
-; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
-; GCN-NEXT: v_sub_i32_e32 v19, vcc, 0, v11
-; GCN-NEXT: v_sub_i32_e32 v17, vcc, v0, v4
-; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v16, s[0:1]
-; GCN-NEXT: v_sub_i32_e32 v16, vcc, v1, v5
-; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v20, s[2:3]
-; GCN-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[4:5]
-; GCN-NEXT: v_mul_lo_u32 v19, v19, v18
-; GCN-NEXT: v_sub_i32_e32 v20, vcc, v2, v6
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v17, s[0:1]
-; GCN-NEXT: v_add_i32_e32 v15, vcc, 1, v12
-; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v16, s[2:3]
-; GCN-NEXT: v_add_i32_e32 v16, vcc, 1, v13
-; GCN-NEXT: v_add_i32_e32 v17, vcc, 1, v14
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
-; GCN-NEXT: v_cndmask_b32_e32 v0, v12, v15, vcc
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
-; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v8
-; GCN-NEXT: v_cndmask_b32_e32 v1, v13, v16, vcc
+; GCN-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9
+; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9
+; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v13
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10
+; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v5
+; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1]
+; GCN-NEXT: v_sub_i32_e32 v13, vcc, v1, v5
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1]
+; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v5
+; GCN-NEXT: v_mul_lo_u32 v1, v12, v4
+; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v6
+; GCN-NEXT: v_mul_lo_u32 v5, v5, v9
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v12
+; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v4
+; GCN-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3]
+; GCN-NEXT: v_sub_i32_e32 v12, vcc, v0, v4
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3]
+; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v4
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0, v7
+; GCN-NEXT: v_mul_hi_u32 v4, v9, v5
+; GCN-NEXT: v_max_i32_e32 v5, v7, v0
+; GCN-NEXT: v_cvt_f32_u32_e32 v0, v5
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v1
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2
+; GCN-NEXT: v_max_i32_e32 v2, v2, v9
+; GCN-NEXT: v_mul_hi_u32 v4, v2, v4
+; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GCN-NEXT: v_cvt_u32_f32_e32 v9, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v12, s[2:3]
; GCN-NEXT: v_xor_b32_e32 v0, v0, v8
-; GCN-NEXT: v_xor_b32_e32 v1, v1, v9
-; GCN-NEXT: v_mul_hi_u32 v4, v18, v19
-; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v20, s[4:5]
; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
-; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v9
+; GCN-NEXT: v_mul_lo_u32 v8, v4, v6
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10
+; GCN-NEXT: v_cndmask_b32_e64 v1, v10, v13, s[0:1]
+; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v5
+; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
+; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v11
+; GCN-NEXT: v_mul_lo_u32 v10, v10, v9
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4
+; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6
+; GCN-NEXT: v_xor_b32_e32 v1, v1, v11
+; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1]
+; GCN-NEXT: v_sub_i32_e32 v8, vcc, v2, v6
+; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v11
+; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1]
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
-; GCN-NEXT: v_cndmask_b32_e32 v2, v14, v17, vcc
-; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
-; GCN-NEXT: v_max_i32_e32 v5, v3, v5
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v18, v4
-; GCN-NEXT: v_mul_hi_u32 v4, v5, v4
-; GCN-NEXT: v_xor_b32_e32 v2, v2, v10
-; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; GCN-NEXT: v_mul_lo_u32 v6, v4, v11
+; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc
+; GCN-NEXT: v_mul_hi_u32 v4, v9, v10
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v3
+; GCN-NEXT: v_max_i32_e32 v6, v3, v6
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; GCN-NEXT: v_mul_hi_u32 v4, v6, v4
+; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v14
+; GCN-NEXT: v_xor_b32_e32 v2, v2, v14
+; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v14
+; GCN-NEXT: v_mul_lo_u32 v8, v4, v5
; GCN-NEXT: v_xor_b32_e32 v3, v3, v7
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, v6, v8
+; GCN-NEXT: v_sub_i32_e32 v8, vcc, v6, v5
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
+; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v3
-; GCN-NEXT: v_sub_i32_e32 v5, vcc, v5, v6
-; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; GCN-NEXT: v_sub_i32_e32 v7, vcc, v5, v11
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v11
-; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
-; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v11
-; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
; GCN-NEXT: v_xor_b32_e32 v4, v4, v3
; GCN-NEXT: v_sub_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: sdiv_v4i32:
; TONGA: ; %bb.0:
; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; TONGA-NEXT: s_mov_b32 s11, 0xf000
-; TONGA-NEXT: s_mov_b32 s10, -1
-; TONGA-NEXT: s_mov_b32 s6, s10
-; TONGA-NEXT: s_mov_b32 s7, s11
+; TONGA-NEXT: s_mov_b32 s7, 0xf000
+; TONGA-NEXT: s_mov_b32 s6, -1
+; TONGA-NEXT: s_mov_b32 s10, s6
+; TONGA-NEXT: s_mov_b32 s11, s7
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s4, s2
-; TONGA-NEXT: s_mov_b32 s5, s3
-; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; TONGA-NEXT: s_mov_b32 s8, s0
-; TONGA-NEXT: s_mov_b32 s9, s1
+; TONGA-NEXT: s_mov_b32 s8, s2
+; TONGA-NEXT: s_mov_b32 s9, s3
+; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; TONGA-NEXT: s_mov_b32 s4, s0
+; TONGA-NEXT: s_mov_b32 s5, s1
; TONGA-NEXT: s_waitcnt vmcnt(1)
-; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v1
+; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v1
; TONGA-NEXT: s_waitcnt vmcnt(0)
+; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v5
+; TONGA-NEXT: v_xor_b32_e32 v11, v1, v5
+; TONGA-NEXT: v_max_i32_e32 v5, v5, v12
+; TONGA-NEXT: v_cvt_f32_u32_e32 v12, v5
; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v4
-; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v5
-; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v6
; TONGA-NEXT: v_xor_b32_e32 v8, v0, v4
-; TONGA-NEXT: v_xor_b32_e32 v11, v1, v5
-; TONGA-NEXT: v_xor_b32_e32 v14, v2, v6
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v12, v12
; TONGA-NEXT: v_max_i32_e32 v4, v4, v10
-; TONGA-NEXT: v_max_i32_e32 v5, v5, v13
-; TONGA-NEXT: v_max_i32_e32 v6, v6, v16
-; TONGA-NEXT: v_max_i32_e32 v1, v1, v12
-; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v14
+; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v5
+; TONGA-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v12
+; TONGA-NEXT: v_cvt_u32_f32_e32 v10, v10
; TONGA-NEXT: v_cvt_f32_u32_e32 v12, v4
-; TONGA-NEXT: v_cvt_f32_u32_e32 v14, v5
-; TONGA-NEXT: v_cvt_f32_u32_e32 v16, v6
-; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v0
+; TONGA-NEXT: v_max_i32_e32 v1, v1, v13
+; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v6
+; TONGA-NEXT: v_mul_lo_u32 v16, v16, v10
; TONGA-NEXT: v_rcp_iflag_f32_e32 v12, v12
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v14, v14
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v16, v16
-; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v2
+; TONGA-NEXT: v_xor_b32_e32 v14, v2, v6
+; TONGA-NEXT: v_max_i32_e32 v6, v6, v15
+; TONGA-NEXT: v_mul_hi_u32 v16, v10, v16
; TONGA-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12
-; TONGA-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14
-; TONGA-NEXT: v_mul_f32_e32 v16, 0x4f7ffffe, v16
; TONGA-NEXT: v_cvt_u32_f32_e32 v12, v12
-; TONGA-NEXT: v_cvt_u32_f32_e32 v14, v14
-; TONGA-NEXT: v_cvt_u32_f32_e32 v16, v16
-; TONGA-NEXT: v_sub_u32_e32 v17, vcc, 0, v7
+; TONGA-NEXT: v_cvt_f32_u32_e32 v15, v6
+; TONGA-NEXT: v_add_u32_e32 v10, vcc, v10, v16
+; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v4
+; TONGA-NEXT: v_mul_lo_u32 v16, v16, v12
+; TONGA-NEXT: v_mul_hi_u32 v10, v1, v10
+; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v0
+; TONGA-NEXT: v_mul_hi_u32 v13, v12, v16
; TONGA-NEXT: v_max_i32_e32 v0, v0, v9
-; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v11
-; TONGA-NEXT: v_max_i32_e32 v2, v2, v15
-; TONGA-NEXT: v_max_i32_e32 v11, v7, v17
-; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v4
-; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v5
-; TONGA-NEXT: v_sub_u32_e32 v17, vcc, 0, v6
-; TONGA-NEXT: v_mul_lo_u32 v13, v13, v12
-; TONGA-NEXT: v_mul_lo_u32 v15, v15, v14
-; TONGA-NEXT: v_mul_lo_u32 v17, v17, v16
-; TONGA-NEXT: v_cvt_f32_u32_e32 v18, v11
-; TONGA-NEXT: v_mul_hi_u32 v13, v12, v13
-; TONGA-NEXT: v_mul_hi_u32 v15, v14, v15
-; TONGA-NEXT: v_mul_hi_u32 v17, v16, v17
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v18, v18
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v9, v15
+; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v8
; TONGA-NEXT: v_add_u32_e32 v12, vcc, v12, v13
-; TONGA-NEXT: v_add_u32_e32 v13, vcc, v14, v15
-; TONGA-NEXT: v_add_u32_e32 v14, vcc, v16, v17
+; TONGA-NEXT: v_mul_lo_u32 v13, v10, v5
; TONGA-NEXT: v_mul_hi_u32 v12, v0, v12
-; TONGA-NEXT: v_mul_hi_u32 v13, v1, v13
-; TONGA-NEXT: v_mul_hi_u32 v14, v2, v14
-; TONGA-NEXT: v_mul_f32_e32 v18, 0x4f7ffffe, v18
-; TONGA-NEXT: v_mul_lo_u32 v15, v12, v4
-; TONGA-NEXT: v_mul_lo_u32 v17, v13, v5
-; TONGA-NEXT: v_mul_lo_u32 v21, v14, v6
-; TONGA-NEXT: v_cvt_u32_f32_e32 v18, v18
-; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v15
-; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v17
-; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v21
-; TONGA-NEXT: v_add_u32_e32 v16, vcc, 1, v12
-; TONGA-NEXT: v_add_u32_e32 v20, vcc, 1, v13
-; TONGA-NEXT: v_add_u32_e32 v15, vcc, 1, v14
-; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
-; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5
-; TONGA-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
-; TONGA-NEXT: v_sub_u32_e32 v19, vcc, 0, v11
-; TONGA-NEXT: v_sub_u32_e32 v17, vcc, v0, v4
-; TONGA-NEXT: v_cndmask_b32_e64 v12, v12, v16, s[0:1]
-; TONGA-NEXT: v_sub_u32_e32 v16, vcc, v1, v5
-; TONGA-NEXT: v_cndmask_b32_e64 v13, v13, v20, s[2:3]
-; TONGA-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[4:5]
-; TONGA-NEXT: v_mul_lo_u32 v19, v19, v18
-; TONGA-NEXT: v_sub_u32_e32 v20, vcc, v2, v6
-; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v17, s[0:1]
-; TONGA-NEXT: v_add_u32_e32 v15, vcc, 1, v12
-; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v16, s[2:3]
-; TONGA-NEXT: v_add_u32_e32 v16, vcc, 1, v13
-; TONGA-NEXT: v_add_u32_e32 v17, vcc, 1, v14
-; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
-; TONGA-NEXT: v_cndmask_b32_e32 v0, v12, v15, vcc
-; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
-; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v8
-; TONGA-NEXT: v_cndmask_b32_e32 v1, v13, v16, vcc
+; TONGA-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9
+; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v9
+; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v13
+; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v5
+; TONGA-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1]
+; TONGA-NEXT: v_sub_u32_e32 v13, vcc, v1, v5
+; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1]
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v5
+; TONGA-NEXT: v_mul_lo_u32 v1, v12, v4
+; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v6
+; TONGA-NEXT: v_mul_lo_u32 v5, v5, v9
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
+; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v12
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v4
+; TONGA-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3]
+; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v0, v4
+; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3]
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v4
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, 0, v7
+; TONGA-NEXT: v_mul_hi_u32 v4, v9, v5
+; TONGA-NEXT: v_max_i32_e32 v5, v7, v0
+; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v5
+; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v1
+; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2
+; TONGA-NEXT: v_max_i32_e32 v2, v2, v9
+; TONGA-NEXT: v_mul_hi_u32 v4, v2, v4
+; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v0
+; TONGA-NEXT: v_cndmask_b32_e64 v0, v1, v12, s[2:3]
; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8
-; TONGA-NEXT: v_xor_b32_e32 v1, v1, v9
-; TONGA-NEXT: v_mul_hi_u32 v4, v18, v19
-; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v20, s[4:5]
; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8
-; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v9
+; TONGA-NEXT: v_mul_lo_u32 v8, v4, v6
+; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10
+; TONGA-NEXT: v_cndmask_b32_e64 v1, v10, v13, s[0:1]
+; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v5
+; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v8
+; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v11
+; TONGA-NEXT: v_mul_lo_u32 v10, v10, v9
+; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6
+; TONGA-NEXT: v_xor_b32_e32 v1, v1, v11
+; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1]
+; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v2, v6
+; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v11
+; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1]
+; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
-; TONGA-NEXT: v_cndmask_b32_e32 v2, v14, v17, vcc
-; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v3
-; TONGA-NEXT: v_max_i32_e32 v5, v3, v5
-; TONGA-NEXT: v_add_u32_e32 v4, vcc, v18, v4
-; TONGA-NEXT: v_mul_hi_u32 v4, v5, v4
-; TONGA-NEXT: v_xor_b32_e32 v2, v2, v10
-; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v10
-; TONGA-NEXT: v_mul_lo_u32 v6, v4, v11
+; TONGA-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc
+; TONGA-NEXT: v_mul_hi_u32 v4, v9, v10
+; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v3
+; TONGA-NEXT: v_max_i32_e32 v6, v3, v6
+; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4
+; TONGA-NEXT: v_mul_hi_u32 v4, v6, v4
+; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v14
+; TONGA-NEXT: v_xor_b32_e32 v2, v2, v14
+; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v14
+; TONGA-NEXT: v_mul_lo_u32 v8, v4, v5
; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7
+; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4
+; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v8
+; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v6, v5
+; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
+; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; TONGA-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4
+; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v3
-; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v5, v6
-; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v4
-; TONGA-NEXT: v_sub_u32_e32 v7, vcc, v5, v11
-; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v5, v11
-; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; TONGA-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
-; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v4
-; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v5, v11
-; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
; TONGA-NEXT: v_xor_b32_e32 v4, v4, v3
; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v4, v3
-; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v4i32:
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 04a824a073a7eb..459ef648fd806c 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -873,20 +873,20 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i
; NOSDWA-NEXT: v_lshrrev_b32_e32 v13, 16, v5
; NOSDWA-NEXT: v_mul_f16_e32 v1, v5, v1
; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; NOSDWA-NEXT: v_lshrrev_b32_e32 v14, 16, v4
; NOSDWA-NEXT: v_mul_f16_e32 v0, v4, v0
-; NOSDWA-NEXT: v_mul_f16_e32 v4, v11, v10
+; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; NOSDWA-NEXT: v_mul_f16_e32 v10, v11, v10
; NOSDWA-NEXT: v_mul_f16_e32 v7, v12, v7
; NOSDWA-NEXT: v_mul_f16_e32 v6, v13, v6
-; NOSDWA-NEXT: v_mul_f16_e32 v5, v14, v5
-; NOSDWA-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; NOSDWA-NEXT: v_mul_f16_e32 v4, v4, v5
+; NOSDWA-NEXT: v_lshlrev_b32_e32 v5, 16, v10
; NOSDWA-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; NOSDWA-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; NOSDWA-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; NOSDWA-NEXT: v_or_b32_e32 v3, v3, v4
+; NOSDWA-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; NOSDWA-NEXT: v_or_b32_e32 v3, v3, v5
; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v7
; NOSDWA-NEXT: v_or_b32_e32 v1, v1, v6
-; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v5
+; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v4
; NOSDWA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; NOSDWA-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index 572026da79646c..26a4a6743cffae 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -1508,52 +1508,52 @@ define <8 x half> @v_vselect_v8f16(<8 x half> %a, <8 x half> %b, <8 x i32> %cond
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v9
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v14
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v15
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
-; SI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
; SI-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20
; SI-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21
; SI-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22
-; SI-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23
-; SI-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc
+; SI-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_vselect_v8f16:
@@ -1652,81 +1652,81 @@ define <16 x half> @v_select_v16f16(<16 x half> %a, <16 x half> %b, i32 %cond) {
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: v_or_b32_e32 v10, v10, v11
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32
-; SI-NEXT: v_cvt_f16_f32_e32 v11, v27
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32
; SI-NEXT: v_or_b32_e32 v13, v15, v13
-; SI-NEXT: v_cvt_f16_f32_e32 v15, v26
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT: v_or_b32_e32 v11, v15, v11
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:4
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v27
+; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
-; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_or_b32_e32 v15, v26, v15
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v25
; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v23
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT: v_cvt_f16_f32_e32 v26, v30
-; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17
-; SI-NEXT: v_or_b32_e32 v8, v8, v9
-; SI-NEXT: v_or_b32_e32 v9, v24, v25
-; SI-NEXT: v_or_b32_e32 v22, v22, v23
-; SI-NEXT: v_or_b32_e32 v4, v4, v5
-; SI-NEXT: v_or_b32_e32 v5, v20, v21
-; SI-NEXT: v_or_b32_e32 v3, v18, v3
+; SI-NEXT: v_or_b32_e32 v9, v24, v9
+; SI-NEXT: v_or_b32_e32 v7, v22, v7
+; SI-NEXT: v_or_b32_e32 v5, v20, v5
; SI-NEXT: v_or_b32_e32 v1, v16, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v28
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_or_b32_e32 v7, v26, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v11, v30
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v3, v11, v3
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19
+; SI-NEXT: v_or_b32_e32 v11, v18, v11
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
-; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v11, v11, v2, vcc
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
-; SI-NEXT: v_cndmask_b32_e32 v15, v22, v6, vcc
+; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
; SI-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
-; SI-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc
+; SI-NEXT: v_cndmask_b32_e32 v15, v15, v10, vcc
; SI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc
-; SI-NEXT: v_cndmask_b32_e32 v16, v7, v14, vcc
+; SI-NEXT: v_cndmask_b32_e32 v16, v3, v14, vcc
; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v11
; SI-NEXT: v_cvt_f32_f16_e32 v4, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v15
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v7
; SI-NEXT: v_cvt_f32_f16_e32 v8, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v11
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v15
; SI-NEXT: v_cvt_f32_f16_e32 v12, v13
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16
; SI-NEXT: v_cvt_f32_f16_e32 v14, v16
@@ -1772,136 +1772,132 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32>
; SI-LABEL: v_vselect_v16f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v16
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:32
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v31
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v31
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
-; SI-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[6:7]
-; SI-NEXT: v_cvt_f16_f32_e32 v16, v17
-; SI-NEXT: v_cvt_f16_f32_e32 v17, v27
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v36
+; SI-NEXT: v_cndmask_b32_e32 v0, v37, v0, vcc
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v38
+; SI-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
+; SI-NEXT: v_cvt_f16_f32_e32 v17, v18
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v18, v20
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48
; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v31
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12
-; SI-NEXT: v_cndmask_b32_e64 v1, v16, v1, s[8:9]
-; SI-NEXT: v_cvt_f16_f32_e32 v16, v18
-; SI-NEXT: v_cvt_f16_f32_e32 v18, v28
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v31
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
-; SI-NEXT: v_cndmask_b32_e64 v2, v16, v2, s[10:11]
-; SI-NEXT: v_cvt_f16_f32_e32 v16, v19
-; SI-NEXT: v_cvt_f16_f32_e32 v19, v29
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v31
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
-; SI-NEXT: v_cndmask_b32_e64 v3, v16, v3, s[12:13]
-; SI-NEXT: v_cvt_f16_f32_e32 v16, v20
-; SI-NEXT: v_cvt_f16_f32_e32 v20, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v31
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
-; SI-NEXT: v_cndmask_b32_e64 v4, v16, v4, s[14:15]
-; SI-NEXT: v_cvt_f16_f32_e32 v16, v21
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT: v_cndmask_b32_e64 v5, v16, v5, s[4:5]
-; SI-NEXT: v_cvt_f16_f32_e32 v16, v22
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc
-; SI-NEXT: v_cvt_f16_f32_e32 v16, v23
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v31
-; SI-NEXT: v_cndmask_b32_e64 v7, v16, v7, s[16:17]
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
-; SI-NEXT: v_cvt_f16_f32_e32 v16, v24
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
-; SI-NEXT: v_cvt_f16_f32_e32 v16, v25
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:44
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
-; SI-NEXT: v_cvt_f16_f32_e32 v16, v26
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52
-; SI-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cndmask_b32_e32 v2, v17, v2, vcc
+; SI-NEXT: v_cvt_f16_f32_e32 v17, v19
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
+; SI-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v33
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
+; SI-NEXT: v_cvt_f16_f32_e32 v17, v21
+; SI-NEXT: v_cndmask_b32_e32 v4, v18, v4, vcc
+; SI-NEXT: v_cvt_f16_f32_e32 v18, v22
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
+; SI-NEXT: v_cvt_f16_f32_e32 v22, v23
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v18
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56
+; SI-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: v_cndmask_b32_e32 v6, v21, v6, vcc
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
-; SI-NEXT: v_cndmask_b32_e32 v12, v18, v12, vcc
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT: v_cvt_f16_f32_e32 v23, v24
+; SI-NEXT: v_cvt_f16_f32_e32 v24, v25
+; SI-NEXT: v_cndmask_b32_e32 v7, v22, v7, vcc
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32
+; SI-NEXT: v_cndmask_b32_e32 v8, v23, v8, vcc
+; SI-NEXT: v_cvt_f16_f32_e32 v23, v26
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v33
+; SI-NEXT: v_cndmask_b32_e32 v9, v24, v9, vcc
+; SI-NEXT: v_cvt_f16_f32_e32 v24, v27
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v19, v28
+; SI-NEXT: v_cndmask_b32_e32 v10, v23, v10, vcc
+; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20
+; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v20, v29
+; SI-NEXT: v_cndmask_b32_e32 v11, v24, v11, vcc
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64
-; SI-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_cvt_f16_f32_e32 v17, v30
+; SI-NEXT: v_cndmask_b32_e32 v12, v19, v12, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT: v_cndmask_b32_e32 v14, v20, v14, vcc
+; SI-NEXT: v_cvt_f16_f32_e32 v18, v21
+; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
+; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
+; SI-NEXT: v_cndmask_b32_e32 v13, v20, v13, vcc
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
+; SI-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22
+; SI-NEXT: v_cndmask_b32_e32 v15, v18, v15, vcc
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_vselect_v16f16:
@@ -1912,25 +1908,22 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32>
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v31, s30, 0
; VI-NEXT: v_writelane_b32 v31, s31, 1
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
+; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
; VI-NEXT: v_cmp_eq_u32_e64 s[18:19], 0, v17
; VI-NEXT: v_cmp_eq_u32_e64 s[30:31], 0, v29
; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v6
; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v14
-; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18
-; VI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v24
+; VI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v18
; VI-NEXT: v_cmp_eq_u32_e64 s[28:29], 0, v27
; VI-NEXT: v_cndmask_b32_e64 v16, v17, v16, s[30:31]
; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v5
; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13
; VI-NEXT: v_cmp_eq_u32_e64 s[20:21], 0, v19
+; VI-NEXT: v_cmp_eq_u32_e64 s[26:27], 0, v25
; VI-NEXT: v_cndmask_b32_e64 v17, v18, v17, s[28:29]
; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4
; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12
-; VI-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[10:11]
-; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32
-; VI-NEXT: v_cmp_eq_u32_e64 s[26:27], 0, v25
-; VI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v20
+; VI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v20
; VI-NEXT: v_cmp_eq_u32_e64 s[24:25], 0, v23
; VI-NEXT: v_cndmask_b32_e64 v18, v19, v18, s[26:27]
; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3
@@ -1939,46 +1932,49 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32>
; VI-NEXT: v_cndmask_b32_e64 v19, v20, v19, s[24:25]
; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v2
; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v10
-; VI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v22
+; VI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v22
; VI-NEXT: v_cndmask_b32_e64 v20, v21, v20, s[22:23]
; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v9
; VI-NEXT: v_cndmask_b32_e64 v21, v22, v21, s[20:21]
-; VI-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5]
-; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v21
; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v0
; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8
-; VI-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7]
+; VI-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5]
+; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32
+; VI-NEXT: v_cndmask_b32_e64 v22, v23, v22, s[18:19]
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30
+; VI-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7]
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v22
+; VI-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[8:9]
+; VI-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; VI-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v15
+; VI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v24
+; VI-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[10:11]
+; VI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v26
+; VI-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[12:13]
+; VI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v28
+; VI-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[14:15]
+; VI-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[16:17]
+; VI-NEXT: v_readlane_b32 s31, v31, 1
+; VI-NEXT: v_readlane_b32 s30, v31, 0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
+; VI-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v21
; VI-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v20
-; VI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v26
-; VI-NEXT: v_cndmask_b32_e64 v22, v23, v22, s[18:19]
-; VI-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[8:9]
; VI-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v19
-; VI-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[12:13]
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v7
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v15
-; VI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v22
; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v18
-; VI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v28
-; VI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v30
-; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v4, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v17
-; VI-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[14:15]
-; VI-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[16:17]
; VI-NEXT: v_or_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v16
-; VI-NEXT: v_or_b32_sdwa v6, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_readlane_b32 s31, v31, 1
-; VI-NEXT: v_readlane_b32 s30, v31, 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12
-; VI-NEXT: v_cndmask_b32_e32 v8, v13, v11, vcc
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; VI-NEXT: v_or_b32_sdwa v6, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index fc6ad39db5b89f..a423b6f831a9d8 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -286,18 +286,18 @@ define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v16
; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v8
; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9]
-; GCN-NEXT: v_or_b32_e32 v18, v18, v16
; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; GCN-NEXT: v_subrev_i32_e64 v16, s[6:7], 64, v8
-; GCN-NEXT: v_or_b32_e32 v19, v19, v17
-; GCN-NEXT: v_lshl_b64 v[16:17], v[0:1], v16
; GCN-NEXT: v_or_b32_e32 v11, v9, v11
+; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8
+; GCN-NEXT: v_or_b32_e32 v19, v19, v17
+; GCN-NEXT: v_or_b32_e32 v18, v18, v16
; GCN-NEXT: v_or_b32_e32 v10, v8, v10
+; GCN-NEXT: v_lshl_b64 v[16:17], v[0:1], v9
; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc
; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; GCN-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[4:5]
; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12
-; GCN-NEXT: v_cndmask_b32_e64 v2, v16, v2, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc
; GCN-NEXT: v_lshr_b64 v[9:10], v[4:5], v9
; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v12
@@ -335,18 +335,18 @@ define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v16
; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v8
; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9]
-; GCN-NEXT: v_or_b32_e32 v18, v18, v16
; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; GCN-NEXT: v_subrev_i32_e64 v16, s[6:7], 64, v8
-; GCN-NEXT: v_or_b32_e32 v19, v19, v17
-; GCN-NEXT: v_lshr_b64 v[16:17], v[2:3], v16
; GCN-NEXT: v_or_b32_e32 v11, v9, v11
+; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8
+; GCN-NEXT: v_or_b32_e32 v19, v19, v17
+; GCN-NEXT: v_or_b32_e32 v18, v18, v16
; GCN-NEXT: v_or_b32_e32 v10, v8, v10
+; GCN-NEXT: v_lshr_b64 v[16:17], v[2:3], v9
; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc
; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; GCN-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v9, v0, s[4:5]
; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12
-; GCN-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc
; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v9
; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v12
@@ -384,18 +384,18 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v16
; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v8
; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9]
-; GCN-NEXT: v_or_b32_e32 v18, v18, v16
; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; GCN-NEXT: v_subrev_i32_e64 v16, s[6:7], 64, v8
-; GCN-NEXT: v_or_b32_e32 v19, v19, v17
-; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v16
; GCN-NEXT: v_or_b32_e32 v11, v9, v11
+; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8
+; GCN-NEXT: v_or_b32_e32 v19, v19, v17
+; GCN-NEXT: v_or_b32_e32 v18, v18, v16
; GCN-NEXT: v_or_b32_e32 v10, v8, v10
+; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v9
; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc
; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; GCN-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v9, v0, s[4:5]
; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12
-; GCN-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc
; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v9
; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v12
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index 6b4bca11d80c78..7e7f4f5d19914b 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -911,20 +911,20 @@ define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
-; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
-; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48
+; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
+; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0
+; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v10
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v6
+; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshl_b64 v[6:7], v[6:7], v13
-; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], v11
-; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
-; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: v_lshl_b64 v[9:10], v[9:10], v13
+; SI-NEXT: v_lshl_b64 v[7:8], v[7:8], v11
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: shl_v4i64:
diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index 9d550ec27a63bf..8150328dd24f03 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -605,20 +605,20 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
-; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
-; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48
+; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
+; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0
+; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], v10
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], v6
+; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], v4
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_ashr_i64 v[6:7], v[6:7], v13
-; SI-NEXT: v_ashr_i64 v[4:5], v[4:5], v11
-; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], v8
-; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: v_ashr_i64 v[9:10], v[9:10], v13
+; SI-NEXT: v_ashr_i64 v[7:8], v[7:8], v11
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: ashr_v4i64:
@@ -631,20 +631,20 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s6
; VI-NEXT: s_mov_b32 s9, s7
-; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
-; VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48
+; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
+; VI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0
+; VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_ashrrev_i64 v[2:3], v10, v[2:3]
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_ashrrev_i64 v[2:3], v6, v[2:3]
+; VI-NEXT: v_ashrrev_i64 v[0:1], v4, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ashrrev_i64 v[6:7], v13, v[6:7]
-; VI-NEXT: v_ashrrev_i64 v[4:5], v11, v[4:5]
-; VI-NEXT: v_ashrrev_i64 v[0:1], v8, v[0:1]
-; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: v_ashrrev_i64 v[9:10], v13, v[9:10]
+; VI-NEXT: v_ashrrev_i64 v[7:8], v11, v[7:8]
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: ashr_v4i64:
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index ce15bbcc9e189b..6423267be4b34f 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -6117,108 +6117,108 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v11
; TONGA-NEXT: v_add_u32_e32 v9, vcc, v10, v8
; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v11, v8, vcc
-; TONGA-NEXT: v_xor_b32_e32 v22, v9, v8
-; TONGA-NEXT: v_xor_b32_e32 v11, v11, v8
-; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v22
-; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v11
-; TONGA-NEXT: v_sub_u32_e32 v23, vcc, 0, v22
-; TONGA-NEXT: v_subb_u32_e32 v24, vcc, 0, v11, vcc
-; TONGA-NEXT: v_madmk_f32 v8, v9, 0x4f800000, v8
-; TONGA-NEXT: v_rcp_f32_e32 v8, v8
-; TONGA-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8
-; TONGA-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8
-; TONGA-NEXT: v_trunc_f32_e32 v9, v9
-; TONGA-NEXT: v_madmk_f32 v8, v9, 0xcf800000, v8
-; TONGA-NEXT: v_cvt_u32_f32_e32 v20, v9
-; TONGA-NEXT: v_cvt_u32_f32_e32 v21, v8
-; TONGA-NEXT: v_mul_lo_u32 v18, v23, v20
-; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v23, v21, 0
-; TONGA-NEXT: v_mul_lo_u32 v19, v24, v21
-; TONGA-NEXT: v_add_u32_e32 v9, vcc, v9, v18
-; TONGA-NEXT: v_add_u32_e32 v25, vcc, v9, v19
-; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v21, v25, 0
-; TONGA-NEXT: v_mul_hi_u32 v9, v21, v8
-; TONGA-NEXT: v_add_u32_e32 v26, vcc, v9, v18
-; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v20, v8, 0
-; TONGA-NEXT: v_addc_u32_e32 v27, vcc, 0, v19, vcc
-; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v20, v25, 0
-; TONGA-NEXT: v_add_u32_e32 v8, vcc, v26, v8
-; TONGA-NEXT: v_addc_u32_e32 v8, vcc, v27, v9, vcc
-; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v19, vcc
-; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v18
-; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; TONGA-NEXT: v_add_u32_e32 v25, vcc, v21, v8
-; TONGA-NEXT: v_addc_u32_e32 v26, vcc, v20, v9, vcc
-; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v23, v25, 0
-; TONGA-NEXT: v_mul_lo_u32 v20, v23, v26
-; TONGA-NEXT: v_mul_lo_u32 v21, v24, v25
-; TONGA-NEXT: v_mul_hi_u32 v23, v25, v8
-; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v26, v8, 0
-; TONGA-NEXT: v_add_u32_e32 v9, vcc, v20, v9
-; TONGA-NEXT: v_add_u32_e32 v9, vcc, v9, v21
-; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v25, v9, 0
-; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v26, v9, 0
-; TONGA-NEXT: v_add_u32_e32 v20, vcc, v23, v20
-; TONGA-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
+; TONGA-NEXT: v_xor_b32_e32 v9, v9, v8
+; TONGA-NEXT: v_xor_b32_e32 v8, v11, v8
+; TONGA-NEXT: v_cvt_f32_u32_e32 v11, v9
+; TONGA-NEXT: v_cvt_f32_u32_e32 v18, v8
+; TONGA-NEXT: v_sub_u32_e32 v23, vcc, 0, v9
+; TONGA-NEXT: v_subb_u32_e32 v24, vcc, 0, v8, vcc
+; TONGA-NEXT: v_madmk_f32 v11, v18, 0x4f800000, v11
+; TONGA-NEXT: v_rcp_f32_e32 v11, v11
+; TONGA-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11
+; TONGA-NEXT: v_mul_f32_e32 v18, 0x2f800000, v11
+; TONGA-NEXT: v_trunc_f32_e32 v18, v18
+; TONGA-NEXT: v_madmk_f32 v11, v18, 0xcf800000, v11
+; TONGA-NEXT: v_cvt_u32_f32_e32 v22, v18
+; TONGA-NEXT: v_cvt_u32_f32_e32 v11, v11
+; TONGA-NEXT: v_mul_lo_u32 v20, v23, v22
+; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v11, 0
+; TONGA-NEXT: v_mul_lo_u32 v21, v24, v11
+; TONGA-NEXT: v_add_u32_e32 v19, vcc, v19, v20
+; TONGA-NEXT: v_add_u32_e32 v21, vcc, v19, v21
+; TONGA-NEXT: v_mad_u64_u32 v[19:20], s[0:1], v11, v21, 0
+; TONGA-NEXT: v_mul_hi_u32 v25, v11, v18
+; TONGA-NEXT: v_add_u32_e32 v25, vcc, v25, v19
+; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v22, v18, 0
+; TONGA-NEXT: v_addc_u32_e32 v26, vcc, 0, v20, vcc
+; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v22, v21, 0
+; TONGA-NEXT: v_add_u32_e32 v18, vcc, v25, v18
+; TONGA-NEXT: v_addc_u32_e32 v18, vcc, v26, v19, vcc
+; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v21, vcc
+; TONGA-NEXT: v_add_u32_e32 v18, vcc, v18, v20
+; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc
+; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v18
+; TONGA-NEXT: v_addc_u32_e32 v25, vcc, v22, v19, vcc
+; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v11, 0
+; TONGA-NEXT: v_mul_lo_u32 v22, v23, v25
+; TONGA-NEXT: v_mul_lo_u32 v23, v24, v11
+; TONGA-NEXT: v_mul_hi_u32 v24, v11, v18
+; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v25, v18, 0
+; TONGA-NEXT: v_add_u32_e32 v19, vcc, v22, v19
+; TONGA-NEXT: v_add_u32_e32 v19, vcc, v19, v23
+; TONGA-NEXT: v_mad_u64_u32 v[22:23], s[0:1], v11, v19, 0
+; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v25, v19, 0
+; TONGA-NEXT: v_add_u32_e32 v22, vcc, v24, v22
+; TONGA-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc
+; TONGA-NEXT: v_add_u32_e32 v20, vcc, v22, v20
+; TONGA-NEXT: v_addc_u32_e32 v20, vcc, v23, v21, vcc
+; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc
; TONGA-NEXT: v_add_u32_e32 v18, vcc, v20, v18
-; TONGA-NEXT: v_addc_u32_e32 v18, vcc, v21, v19, vcc
-; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; TONGA-NEXT: v_add_u32_e32 v8, vcc, v18, v8
-; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; TONGA-NEXT: v_add_u32_e32 v18, vcc, v25, v8
-; TONGA-NEXT: v_addc_u32_e32 v19, vcc, v26, v9, vcc
-; TONGA-NEXT: v_ashrrev_i32_e32 v20, 31, v15
-; TONGA-NEXT: v_add_u32_e32 v8, vcc, v14, v20
-; TONGA-NEXT: v_xor_b32_e32 v21, v8, v20
-; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v21, v19, 0
-; TONGA-NEXT: v_mul_hi_u32 v23, v21, v18
-; TONGA-NEXT: v_addc_u32_e32 v15, vcc, v15, v20, vcc
-; TONGA-NEXT: v_xor_b32_e32 v15, v15, v20
-; TONGA-NEXT: v_add_u32_e32 v23, vcc, v23, v8
-; TONGA-NEXT: v_addc_u32_e32 v24, vcc, 0, v9, vcc
-; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v15, v18, 0
-; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v15, v19, 0
-; TONGA-NEXT: v_add_u32_e32 v8, vcc, v23, v8
-; TONGA-NEXT: v_addc_u32_e32 v8, vcc, v24, v9, vcc
-; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v19, vcc
-; TONGA-NEXT: v_add_u32_e32 v18, vcc, v8, v18
-; TONGA-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc
-; TONGA-NEXT: v_mul_lo_u32 v19, v22, v8
-; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v22, v18, 0
-; TONGA-NEXT: v_mul_lo_u32 v18, v11, v18
-; TONGA-NEXT: v_add_u32_e32 v9, vcc, v19, v9
-; TONGA-NEXT: v_add_u32_e32 v9, vcc, v18, v9
-; TONGA-NEXT: v_sub_u32_e32 v18, vcc, v15, v9
-; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v21, v8
-; TONGA-NEXT: v_subb_u32_e64 v18, s[0:1], v18, v11, vcc
-; TONGA-NEXT: v_sub_u32_e64 v19, s[0:1], v8, v22
-; TONGA-NEXT: v_subbrev_u32_e64 v21, s[2:3], 0, v18, s[0:1]
-; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v21, v11
+; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc
+; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v18
+; TONGA-NEXT: v_addc_u32_e32 v20, vcc, v25, v19, vcc
+; TONGA-NEXT: v_ashrrev_i32_e32 v22, 31, v15
+; TONGA-NEXT: v_add_u32_e32 v18, vcc, v14, v22
+; TONGA-NEXT: v_xor_b32_e32 v23, v18, v22
+; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v20, 0
+; TONGA-NEXT: v_mul_hi_u32 v21, v23, v11
+; TONGA-NEXT: v_addc_u32_e32 v15, vcc, v15, v22, vcc
+; TONGA-NEXT: v_xor_b32_e32 v15, v15, v22
+; TONGA-NEXT: v_add_u32_e32 v24, vcc, v21, v18
+; TONGA-NEXT: v_addc_u32_e32 v25, vcc, 0, v19, vcc
+; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v15, v11, 0
+; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v15, v20, 0
+; TONGA-NEXT: v_add_u32_e32 v11, vcc, v24, v18
+; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v25, v19, vcc
+; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v21, vcc
+; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v20
+; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc
+; TONGA-NEXT: v_mul_lo_u32 v20, v9, v18
+; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v9, v11, 0
+; TONGA-NEXT: v_mul_lo_u32 v11, v8, v11
+; TONGA-NEXT: v_add_u32_e32 v19, vcc, v20, v19
+; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v19
+; TONGA-NEXT: v_sub_u32_e32 v19, vcc, v15, v11
+; TONGA-NEXT: v_sub_u32_e32 v18, vcc, v23, v18
+; TONGA-NEXT: v_subb_u32_e64 v19, s[0:1], v19, v8, vcc
+; TONGA-NEXT: v_sub_u32_e64 v20, s[0:1], v18, v9
+; TONGA-NEXT: v_subbrev_u32_e64 v21, s[2:3], 0, v19, s[0:1]
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v21, v8
; TONGA-NEXT: v_cndmask_b32_e64 v23, 0, -1, s[2:3]
-; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v19, v22
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v20, v9
+; TONGA-NEXT: v_subb_u32_e32 v11, vcc, v15, v11, vcc
; TONGA-NEXT: v_cndmask_b32_e64 v24, 0, -1, s[2:3]
-; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v21, v11
-; TONGA-NEXT: v_subb_u32_e64 v18, s[0:1], v18, v11, s[0:1]
+; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v21, v8
+; TONGA-NEXT: v_subb_u32_e64 v19, s[0:1], v19, v8, s[0:1]
+; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v11, v8
; TONGA-NEXT: v_cndmask_b32_e64 v23, v23, v24, s[2:3]
-; TONGA-NEXT: v_sub_u32_e64 v24, s[0:1], v19, v22
-; TONGA-NEXT: v_subb_u32_e32 v9, vcc, v15, v9, vcc
-; TONGA-NEXT: v_subbrev_u32_e64 v18, s[0:1], 0, v18, s[0:1]
-; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v9, v11
-; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v23
+; TONGA-NEXT: v_sub_u32_e64 v24, s[0:1], v20, v9
; TONGA-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc
-; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v22
-; TONGA-NEXT: v_cndmask_b32_e64 v18, v21, v18, s[0:1]
-; TONGA-NEXT: v_cndmask_b32_e64 v21, 0, -1, vcc
-; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, v9, v11
-; TONGA-NEXT: v_cndmask_b32_e32 v11, v15, v21, vcc
-; TONGA-NEXT: v_cndmask_b32_e64 v19, v19, v24, s[0:1]
-; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
-; TONGA-NEXT: v_cndmask_b32_e32 v8, v8, v19, vcc
-; TONGA-NEXT: v_cndmask_b32_e32 v9, v9, v18, vcc
-; TONGA-NEXT: v_xor_b32_e32 v8, v8, v20
-; TONGA-NEXT: v_xor_b32_e32 v9, v9, v20
-; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v8, v20
-; TONGA-NEXT: v_subb_u32_e32 v9, vcc, v9, v20, vcc
+; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v18, v9
+; TONGA-NEXT: v_subbrev_u32_e64 v19, s[0:1], 0, v19, s[0:1]
+; TONGA-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc
+; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, v11, v8
+; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v23
+; TONGA-NEXT: v_cndmask_b32_e32 v8, v15, v9, vcc
+; TONGA-NEXT: v_cndmask_b32_e64 v20, v20, v24, s[0:1]
+; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; TONGA-NEXT: v_cndmask_b32_e64 v19, v21, v19, s[0:1]
+; TONGA-NEXT: v_cndmask_b32_e32 v9, v18, v20, vcc
+; TONGA-NEXT: v_cndmask_b32_e32 v8, v11, v19, vcc
+; TONGA-NEXT: v_xor_b32_e32 v9, v9, v22
+; TONGA-NEXT: v_xor_b32_e32 v11, v8, v22
+; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v9, v22
+; TONGA-NEXT: v_subb_u32_e32 v9, vcc, v11, v22, vcc
; TONGA-NEXT: s_cbranch_execnz .LBB12_3
; TONGA-NEXT: .LBB12_2:
; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v10
@@ -8991,33 +8991,33 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1)
; TONGA-NEXT: s_waitcnt vmcnt(1)
; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v1
; TONGA-NEXT: v_lshrrev_b32_e32 v12, 30, v12
-; TONGA-NEXT: v_ashrrev_i32_e32 v13, 31, v3
; TONGA-NEXT: v_add_u32_e32 v12, vcc, v0, v12
-; TONGA-NEXT: v_lshrrev_b32_e32 v13, 30, v13
; TONGA-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc
+; TONGA-NEXT: v_and_b32_e32 v12, -4, v12
+; TONGA-NEXT: v_ashrrev_i32_e32 v13, 31, v3
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v12
+; TONGA-NEXT: v_lshrrev_b32_e32 v13, 30, v13
+; TONGA-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc
+; TONGA-NEXT: v_add_u32_e32 v12, vcc, v2, v13
+; TONGA-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc
+; TONGA-NEXT: v_and_b32_e32 v12, -4, v12
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v5
-; TONGA-NEXT: v_add_u32_e32 v13, vcc, v2, v13
-; TONGA-NEXT: v_lshrrev_b32_e32 v14, 30, v14
-; TONGA-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc
; TONGA-NEXT: v_ashrrev_i32_e32 v15, 31, v7
-; TONGA-NEXT: v_add_u32_e32 v14, vcc, v4, v14
+; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v12
+; TONGA-NEXT: v_lshrrev_b32_e32 v14, 30, v14
; TONGA-NEXT: v_lshrrev_b32_e32 v15, 30, v15
-; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v5, vcc
-; TONGA-NEXT: v_add_u32_e32 v15, vcc, v6, v15
-; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v7, vcc
+; TONGA-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc
+; TONGA-NEXT: v_add_u32_e64 v12, s[0:1], v4, v14
+; TONGA-NEXT: v_add_u32_e32 v13, vcc, v6, v15
+; TONGA-NEXT: v_addc_u32_e32 v15, vcc, 0, v7, vcc
; TONGA-NEXT: v_and_b32_e32 v12, -4, v12
-; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v12
+; TONGA-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v5, s[0:1]
+; TONGA-NEXT: v_sub_u32_e32 v4, vcc, v4, v12
; TONGA-NEXT: v_and_b32_e32 v13, -4, v13
-; TONGA-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc
-; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v13
-; TONGA-NEXT: v_and_b32_e32 v14, -4, v14
-; TONGA-NEXT: v_subb_u32_e32 v3, vcc, v3, v17, vcc
-; TONGA-NEXT: v_sub_u32_e32 v4, vcc, v4, v14
-; TONGA-NEXT: v_and_b32_e32 v15, -4, v15
-; TONGA-NEXT: v_subb_u32_e32 v5, vcc, v5, v18, vcc
-; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v15
-; TONGA-NEXT: v_subb_u32_e32 v7, vcc, v7, v19, vcc
+; TONGA-NEXT: v_subb_u32_e32 v5, vcc, v5, v14, vcc
+; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v13
+; TONGA-NEXT: v_subb_u32_e32 v7, vcc, v7, v15, vcc
; TONGA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; TONGA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; TONGA-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll
index 29488579c15537..a9b1f7e888567f 100644
--- a/llvm/test/CodeGen/AMDGPU/srl.ll
+++ b/llvm/test/CodeGen/AMDGPU/srl.ll
@@ -266,20 +266,20 @@ define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
-; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
-; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48
+; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
+; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0
+; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v10
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v6
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], v4
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], v13
-; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], v11
-; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], v8
-; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], v13
+; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], v11
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: lshr_v4i64:
diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
index 6ed19bd6d764b8..30a0a26ca173e0 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
@@ -776,14 +776,14 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc
-; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32
-; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v4, v20
+; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v4, v20
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v4
-; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v17
+; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4
+; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v16
; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v17, v4, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32
; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v5, v21
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5
@@ -895,14 +895,14 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc
-; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32
-; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v4, v20
+; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v4, v20
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v4
-; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v17
+; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4
+; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v16
; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v17, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32
; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v5, v21
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 7c310477dd838f..530226baa775e1 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -862,43 +862,43 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_mul_lo_u32 v14, v10, v0
; GCN-NEXT: v_mul_lo_u32 v16, v11, v1
; GCN-NEXT: v_mul_lo_u32 v18, v12, v2
-; GCN-NEXT: v_mul_lo_u32 v20, v13, v3
+; GCN-NEXT: v_mul_lo_u32 v19, v13, v3
; GCN-NEXT: v_sub_u32_e32 v6, vcc, v6, v14
; GCN-NEXT: v_sub_u32_e32 v7, vcc, v7, v16
; GCN-NEXT: v_sub_u32_e32 v8, vcc, v8, v18
-; GCN-NEXT: v_sub_u32_e32 v9, vcc, v9, v20
+; GCN-NEXT: v_sub_u32_e32 v9, vcc, v9, v19
; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v10
; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v11
-; GCN-NEXT: v_add_u32_e32 v19, vcc, 1, v12
-; GCN-NEXT: v_add_u32_e32 v21, vcc, 1, v13
+; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v12
+; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v13
; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v6, v0
; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v7, v1
; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2
; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v3
-; GCN-NEXT: v_sub_u32_e32 v14, vcc, v6, v0
+; GCN-NEXT: v_sub_u32_e32 v18, vcc, v6, v0
; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[0:1]
; GCN-NEXT: v_sub_u32_e32 v15, vcc, v7, v1
; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[2:3]
-; GCN-NEXT: v_sub_u32_e32 v16, vcc, v8, v2
-; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v19, s[4:5]
-; GCN-NEXT: v_sub_u32_e32 v17, vcc, v9, v3
-; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v21, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[0:1]
-; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v10
+; GCN-NEXT: v_sub_u32_e32 v17, vcc, v8, v2
+; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[4:5]
+; GCN-NEXT: v_sub_u32_e32 v14, vcc, v9, v3
+; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v16, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v18, s[0:1]
+; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v10
; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[2:3]
; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v11
-; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[4:5]
-; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v12
-; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v17, s[6:7]
-; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v13
+; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v17, s[4:5]
+; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v12
+; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[6:7]
+; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v13
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v10, v14, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, v10, v16, vcc
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v1
; GCN-NEXT: v_cndmask_b32_e32 v1, v11, v15, vcc
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v8, v2
-; GCN-NEXT: v_cndmask_b32_e32 v2, v12, v16, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v12, v17, vcc
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v3
-; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
;
>From 28600c025ed5a5894e84ad23d505c3192d30d9da Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Tue, 21 Jan 2025 16:12:04 +0100
Subject: [PATCH 2/5] Fix formatting issues
---
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 4 ++--
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index dd6ab64925e50e..031d8f0560ff25 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1272,8 +1272,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
}
ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
- STM.computeOccupancy(F, ProgInfo.LDSSize).second, ProgInfo.NumSGPRsForWavesPerEU,
- ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
+ STM.computeOccupancy(F, ProgInfo.LDSSize).second,
+ ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
const auto [MinWEU, MaxWEU] =
AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 22a550450dc2eb..fe9da7b7b505f6 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -411,7 +411,7 @@ GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs);
-
+
// Maximum occupancy may be further limited by high SGPR/VGPR usage.
MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
return {std::min(MinOcc, MaxOcc), MaxOcc};
>From a191fec2b680a180d638415f967940cd7aa943f9 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Tue, 21 Jan 2025 17:53:32 +0100
Subject: [PATCH 3/5] Fix failing MIR tests
---
...ine-function-info-long-branch-reg-debug.ll | 2 +-
.../machine-function-info-long-branch-reg.ll | 2 +-
.../AMDGPU/machine-function-info-no-ir.mir | 20 +++++++++----------
.../MIR/AMDGPU/machine-function-info.ll | 4 ++--
4 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
index d1d8240a1007a2..883657547519ba 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -39,7 +39,7 @@
; CHECK-NEXT: fp64-fp16-input-denormals: true
; CHECK-NEXT: fp64-fp16-output-denormals: true
; CHECK-NEXT: BitsOf32BitAddress: 0
-; CHECK-NEXT: occupancy: 8
+; CHECK-NEXT: occupancy: 10
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
index ad6e92a25b8615..278bf086d6088b 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@@ -39,7 +39,7 @@
; CHECK-NEXT: fp64-fp16-input-denormals: true
; CHECK-NEXT: fp64-fp16-output-denormals: true
; CHECK-NEXT: BitsOf32BitAddress: 0
-; CHECK-NEXT: occupancy: 8
+; CHECK-NEXT: occupancy: 10
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
index 3eff89239d5418..89d831b51f6947 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
@@ -153,7 +153,7 @@ body: |
# FULL-NEXT: fp64-fp16-input-denormals: true
# FULL-NEXT: fp64-fp16-output-denormals: true
# FULL-NEXT: highBitsOf32BitAddress: 0
-# FULL-NEXT: occupancy: 8
+# FULL-NEXT: occupancy: 10
# FULL-NEXT: vgprForAGPRCopy: ''
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
@@ -175,7 +175,7 @@ body: |
# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
-# SIMPLE-NEXT: occupancy: 8
+# SIMPLE-NEXT: occupancy: 10
# SIMPLE-NEXT: body:
name: no_mfi
@@ -229,7 +229,7 @@ body: |
# FULL-NEXT: fp64-fp16-input-denormals: true
# FULL-NEXT: fp64-fp16-output-denormals: true
# FULL-NEXT: highBitsOf32BitAddress: 0
-# FULL-NEXT: occupancy: 8
+# FULL-NEXT: occupancy: 10
# FULL-NEXT: vgprForAGPRCopy: ''
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
@@ -251,7 +251,7 @@ body: |
# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
-# SIMPLE-NEXT: occupancy: 8
+# SIMPLE-NEXT: occupancy: 10
# SIMPLE-NEXT: body:
name: empty_mfi
@@ -306,7 +306,7 @@ body: |
# FULL-NEXT: fp64-fp16-input-denormals: true
# FULL-NEXT: fp64-fp16-output-denormals: true
# FULL-NEXT: highBitsOf32BitAddress: 0
-# FULL-NEXT: occupancy: 8
+# FULL-NEXT: occupancy: 10
# FULL-NEXT: vgprForAGPRCopy: ''
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
@@ -329,7 +329,7 @@ body: |
# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
-# SIMPLE-NEXT: occupancy: 8
+# SIMPLE-NEXT: occupancy: 10
# SIMPLE-NEXT: body:
name: empty_mfi_entry_func
@@ -457,11 +457,11 @@ body: |
...
---
-# ALL-LABEL: name: occupancy_0
-# ALL: occupancy: 8
-name: occupancy_0
+# ALL-LABEL: name: occupancy_10
+# ALL: occupancy: 10
+name: occupancy_10
machineFunctionInfo:
- occupancy: 0
+ occupancy: 10
body: |
bb.0:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
index eca3f99b64955b..ec56de11b250a4 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -167,7 +167,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
; CHECK-NEXT: fp64-fp16-input-denormals: true
; CHECK-NEXT: fp64-fp16-output-denormals: true
; CHECK-NEXT: highBitsOf32BitAddress: 0
-; CHECK-NEXT: occupancy: 8
+; CHECK-NEXT: occupancy: 10
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
@@ -220,7 +220,7 @@ define void @function() {
; CHECK-NEXT: fp64-fp16-input-denormals: true
; CHECK-NEXT: fp64-fp16-output-denormals: true
; CHECK-NEXT: highBitsOf32BitAddress: 0
-; CHECK-NEXT: occupancy: 8
+; CHECK-NEXT: occupancy: 10
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
>From 7d3f9449c362f3c48ca1c083c5fb1b9fc0b6c8cb Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Wed, 22 Jan 2025 16:02:38 +0100
Subject: [PATCH 4/5] Address feedback
---
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 14 +---
.../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2 +-
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 24 +++---
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 +-
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 11 ++-
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 +
.../AMDGPU/agpr-copy-no-free-registers.ll | 73 +++++++++++--------
7 files changed, 74 insertions(+), 56 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 031d8f0560ff25..972994117ee232 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1175,22 +1175,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// Make clamp modifier on NaN input returns 0.
ProgInfo.DX10Clamp = Mode.DX10Clamp;
- unsigned LDSAlignShift;
- if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
- // LDS is allocated in 320 dword blocks.
- LDSAlignShift = 11;
- } else if (STM.getFeatureBits().test(
- FeatureAddressableLocalMemorySize65536)) {
- // LDS is allocated in 128 dword blocks.
- LDSAlignShift = 9;
- } else {
- // LDS is allocated in 64 dword blocks.
- LDSAlignShift = 8;
- }
-
ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
+ unsigned LDSAlignShift = Log2_32_Ceil(STM.getLDSAllocGranularity());
ProgInfo.LDSSize = MFI->getLDSSize();
ProgInfo.LDSBlocks =
alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 907f82ed7fc528..26c65f1e64965d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -1344,7 +1344,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
}
unsigned MaxOccupancy =
- ST.getOccupancyWithWorkGroupSizes(CurrentLocalMemUsage, F).second;
+ ST.getOccupancyWithWorkGroupSizes(CurrentLocalMemUsage, F, TM).second;
// Restrict local memory usage so that we don't drastically reduce occupancy,
// unless it is already significantly reduced.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index da729d4dc7e089..b427174edf552d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -55,13 +55,15 @@ AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
return getLocalMemorySize() / WorkGroupsPerCU;
}
-std::pair<unsigned, unsigned>
-AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes,
- const Function &F) const {
- // FIXME: Is there an allocation granularity for the LDS? If so we would need
- // to make sure the amount of bytes is aligned on that granularity.
-
+std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
+ uint32_t LDSBytes, const Function &F, const TargetMachine &TM) const {
// Compute occupancy restriction based on LDS usage.
+ if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
+ // For GCN subtargets, LDS size must be aligned on allocation granularity.
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ LDSBytes = alignTo(LDSBytes, ST.getLDSAllocGranularity());
+ }
+
const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u);
// Queried LDS size may be larger than available on a CU, in which case we
@@ -72,9 +74,8 @@ AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes,
return {1, 1};
const unsigned WaveSize = getWavefrontSize(), WavesPerEU = getMaxWavesPerEU();
- const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU();
- auto PropsFromWGSize = [&](unsigned WGSize)
+ auto PropsFromWGSize = [=](unsigned WGSize)
-> std::tuple<const unsigned, const unsigned, unsigned> {
unsigned WavesPerWG = divideCeil(WGSize, WaveSize);
unsigned WGsPerCU = std::min(getMaxWorkGroupsPerCU(WGSize), MaxWGsLDS);
@@ -91,10 +92,12 @@ AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes,
// It is possible that we end up with flipped minimum and maximum number of
// waves per CU when the number of minimum/maximum concurrent groups on the CU
- // is limited by LDS usage or barrier ressources.
+ // is limited by LDS usage or barrier resources.
if (MinWavesPerCU >= MaxWavesPerCU) {
std::swap(MinWavesPerCU, MaxWavesPerCU);
} else {
+ const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU();
+
// Look for a potential smaller group size than the maximum which decreases
// the concurrent number of waves on the CU for the same number of
// concurrent workgroups on the CU.
@@ -140,7 +143,8 @@ AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes,
std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
const MachineFunction &MF) const {
const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
- return getOccupancyWithWorkGroupSizes(MFI->getLDSSize(), MF.getFunction());
+ return getOccupancyWithWorkGroupSizes(MFI->getLDSSize(), MF.getFunction(),
+ MF.getTarget());
}
std::pair<unsigned, unsigned>
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 5944b69ce64162..78d2d1041744f9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -133,7 +133,8 @@ class AMDGPUSubtarget {
/// This notably depends on the range of allowed flat group sizes for the
/// function and hardware characteristics.
std::pair<unsigned, unsigned>
- getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const;
+ getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F,
+ const TargetMachine &TM) const;
/// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
/// be achieved when the only function running on a CU is \p MF. This notably
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index fe9da7b7b505f6..737034e59686ed 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -408,7 +408,8 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
std::pair<unsigned, unsigned>
GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
unsigned NumSGPRs, unsigned NumVGPRs) const {
- auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
+ auto [MinOcc, MaxOcc] =
+ getOccupancyWithWorkGroupSizes(LDSSize, F, TLInfo.getTargetMachine());
unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs);
@@ -417,6 +418,14 @@ GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
return {std::min(MinOcc, MaxOcc), MaxOcc};
}
+unsigned GCNSubtarget::getLDSAllocGranularity() const {
+ if (getFeatureBits().test(AMDGPU::FeatureAddressableLocalMemorySize163840))
+ return 1280; // LDS is allocated in 320 dword blocks.
+ if (getFeatureBits().test(AMDGPU::FeatureAddressableLocalMemorySize65536))
+ return 512; // LDS is allocated in 128 dword blocks.
+ return 256; // LDS is allocated in 64 dword blocks.
+}
+
unsigned GCNSubtarget::getBaseMaxNumSGPRs(
const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index a22e413508021d..542aba027ae085 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1381,6 +1381,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
unsigned NumSGPRs = 0,
unsigned NumVGPRs = 0) const;
+ /// Returns the LDS's allocation granularity in bytes.
+ unsigned getLDSAllocGranularity() const;
+
/// \returns true if the flat_scratch register should be initialized with the
/// pointer to the wave's scratch memory rather than a size and offset.
bool flatScratchIsPointer() const {
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 58bb4ef5789ec2..4ce46bbaf45ac1 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -365,7 +365,10 @@ define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #0 {
ret void
}
-define void @v32_asm_def_use(float %v0, float %v1) #0 {
+; FIXME: This case is broken. The asm value passed in v32 is live
+; through the range where the reserved def for the copy is introduced,
+; clobbering the user value.
+define void @v32_asm_def_use(float %v0, float %v1) #4 {
; GFX908-LABEL: v32_asm_def_use:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -374,48 +377,57 @@ define void @v32_asm_def_use(float %v0, float %v1) #0 {
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def v[0:31] a[0:15]
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v32, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v35, a15
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; def v32
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a14
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_write_b32 a31, v32
-; GFX908-NEXT: v_accvgpr_read_b32 v32, a13
+; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a30, v35
+; GFX908-NEXT: v_accvgpr_read_b32 v35, a13
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v35
+; GFX908-NEXT: v_accvgpr_read_b32 v35, a12
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a11
-; GFX908-NEXT: v_accvgpr_write_b32 a29, v32
-; GFX908-NEXT: v_accvgpr_read_b32 v32, a12
+; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a27, v35
+; GFX908-NEXT: v_accvgpr_read_b32 v35, a10
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v35
+; GFX908-NEXT: v_accvgpr_read_b32 v35, a9
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a8
-; GFX908-NEXT: v_accvgpr_write_b32 a28, v32
-; GFX908-NEXT: v_accvgpr_read_b32 v32, a10
+; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a24, v35
+; GFX908-NEXT: v_accvgpr_read_b32 v35, a7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v35
+; GFX908-NEXT: v_accvgpr_read_b32 v35, a6
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a5
-; GFX908-NEXT: v_accvgpr_write_b32 a26, v32
-; GFX908-NEXT: v_accvgpr_read_b32 v32, a9
+; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a21, v35
-; GFX908-NEXT: v_accvgpr_read_b32 v35, a2
-; GFX908-NEXT: v_accvgpr_write_b32 a25, v32
-; GFX908-NEXT: v_accvgpr_read_b32 v32, a7
-; GFX908-NEXT: v_accvgpr_write_b32 a18, v35
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_write_b32 a23, v32
-; GFX908-NEXT: v_accvgpr_read_b32 v32, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v35, a4
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a22, v32
-; GFX908-NEXT: v_accvgpr_read_b32 v32, a4
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v35
+; GFX908-NEXT: v_accvgpr_read_b32 v35, a3
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a20, v32
-; GFX908-NEXT: v_accvgpr_read_b32 v32, a3
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v35
+; GFX908-NEXT: v_accvgpr_read_b32 v35, a2
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a19, v32
-; GFX908-NEXT: v_accvgpr_read_b32 v32, a1
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v35
+; GFX908-NEXT: v_accvgpr_read_b32 v35, a1
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a17, v32
-; GFX908-NEXT: v_accvgpr_read_b32 v32, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v35
+; GFX908-NEXT: v_accvgpr_read_b32 v35, a0
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a16, v32
-; GFX908-NEXT: ;;#ASMSTART
-; GFX908-NEXT: ; def v32
-; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v35
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; copy
; GFX908-NEXT: ;;#ASMEND
@@ -1133,3 +1145,4 @@ attributes #0 = { "amdgpu-waves-per-eu"="6,6" }
attributes #1 = { convergent nounwind readnone willreturn }
attributes #2 = { nounwind readnone willreturn }
attributes #3 = { "amdgpu-waves-per-eu"="7,7" }
+attributes #4 = { "amdgpu-waves-per-eu"="6,6" "amdgpu-flat-work-group-size"="1024,1024" }
>From 260463f33362377b21d12c0725658038f9eed8a6 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Thu, 23 Jan 2025 14:45:05 +0100
Subject: [PATCH 5/5] Revert changes related to LDS alloc. granularity
---
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 13 ++++++++++++-
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2 +-
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 16 +++++-----------
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 +--
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 11 +----------
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 ---
6 files changed, 20 insertions(+), 28 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 972994117ee232..906dd3142ff5b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1178,7 +1178,18 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
- unsigned LDSAlignShift = Log2_32_Ceil(STM.getLDSAllocGranularity());
+ unsigned LDSAlignShift;
+ if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
+ // LDS is allocated in 320 dword blocks.
+ LDSAlignShift = 11;
+ } else if (STM.getFeatureBits().test(
+ FeatureAddressableLocalMemorySize65536)) {
+ // LDS is allocated in 128 dword blocks.
+ LDSAlignShift = 9;
+ } else {
+ // LDS is allocated in 64 dword blocks.
+ LDSAlignShift = 8;
+ }
ProgInfo.LDSSize = MFI->getLDSSize();
ProgInfo.LDSBlocks =
alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 26c65f1e64965d..907f82ed7fc528 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -1344,7 +1344,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
}
unsigned MaxOccupancy =
- ST.getOccupancyWithWorkGroupSizes(CurrentLocalMemUsage, F, TM).second;
+ ST.getOccupancyWithWorkGroupSizes(CurrentLocalMemUsage, F).second;
// Restrict local memory usage so that we don't drastically reduce occupancy,
// unless it is already significantly reduced.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index b427174edf552d..d98a0ffcaf7e38 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -55,15 +55,10 @@ AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
return getLocalMemorySize() / WorkGroupsPerCU;
}
-std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
- uint32_t LDSBytes, const Function &F, const TargetMachine &TM) const {
- // Compute occupancy restriction based on LDS usage.
- if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
- // For GCN subtargets, LDS size must be aligned on allocation granularity.
- const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- LDSBytes = alignTo(LDSBytes, ST.getLDSAllocGranularity());
- }
-
+std::pair<unsigned, unsigned>
+AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes,
+ const Function &F) const {
+ // FIXME: We should take into account the LDS allocation granularity.
const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u);
// Queried LDS size may be larger than available on a CU, in which case we
@@ -143,8 +138,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
const MachineFunction &MF) const {
const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
- return getOccupancyWithWorkGroupSizes(MFI->getLDSSize(), MF.getFunction(),
- MF.getTarget());
+ return getOccupancyWithWorkGroupSizes(MFI->getLDSSize(), MF.getFunction());
}
std::pair<unsigned, unsigned>
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 78d2d1041744f9..5944b69ce64162 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -133,8 +133,7 @@ class AMDGPUSubtarget {
/// This notably depends on the range of allowed flat group sizes for the
/// function and hardware characteristics.
std::pair<unsigned, unsigned>
- getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F,
- const TargetMachine &TM) const;
+ getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const;
/// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
/// be achieved when the only function running on a CU is \p MF. This notably
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 737034e59686ed..fe9da7b7b505f6 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -408,8 +408,7 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
std::pair<unsigned, unsigned>
GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
unsigned NumSGPRs, unsigned NumVGPRs) const {
- auto [MinOcc, MaxOcc] =
- getOccupancyWithWorkGroupSizes(LDSSize, F, TLInfo.getTargetMachine());
+ auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs);
@@ -418,14 +417,6 @@ GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
return {std::min(MinOcc, MaxOcc), MaxOcc};
}
-unsigned GCNSubtarget::getLDSAllocGranularity() const {
- if (getFeatureBits().test(AMDGPU::FeatureAddressableLocalMemorySize163840))
- return 1280; // LDS is allocated in 320 dword blocks.
- if (getFeatureBits().test(AMDGPU::FeatureAddressableLocalMemorySize65536))
- return 512; // LDS is allocated in 128 dword blocks.
- return 256; // LDS is allocated in 64 dword blocks.
-}
-
unsigned GCNSubtarget::getBaseMaxNumSGPRs(
const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 542aba027ae085..a22e413508021d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1381,9 +1381,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
unsigned NumSGPRs = 0,
unsigned NumVGPRs = 0) const;
- /// Returns the LDS's allocation granularity in bytes.
- unsigned getLDSAllocGranularity() const;
-
/// \returns true if the flat_scratch register should be initialized with the
/// pointer to the wave's scratch memory rather than a size and offset.
bool flatScratchIsPointer() const {
More information about the llvm-commits mailing list